/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include "config/av1_rtcd.h"
#include "av1/common/warped_motion.h"
#include "aom_dsp/x86/synonyms.h"

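// Byte shuffle masks used when alpha == 0: every output pixel of a row shares
// one 8-tap horizontal filter, so prepare_horizontal_filter_coeff_alpha0_avx2
// replicates each 16-bit coefficient pair of that filter across its 128-bit
// lane (one lane per source row).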
DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};

DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
};

DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
};

DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
};

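// Byte shuffle masks used when gamma == 0: every column of the block shares
// one vertical filter, so prepare_vertical_filter_coeffs_gamma0_avx2
// replicates each pair of 16-bit taps (4 bytes) across its 128-bit lane.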
DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};

DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};

DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
};

DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
};

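// Byte shuffle masks that gather the source pixel pairs consumed by
// _mm256_maddubs_epi16 in filter_src_pixels_avx2.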
DECLARE_ALIGNED(32, static const uint8_t,
                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };

DECLARE_ALIGNED(32, static const uint8_t,
                shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7,
                                      9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10,
                                      10, 12, 5, 7, 7, 9, 9, 11, 11, 13 };

DECLARE_ALIGNED(32, static const uint8_t,
                shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
                                      7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };

DECLARE_ALIGNED(32, static const uint8_t,
                shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8,
                                      10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11,
                                      11, 13, 6, 8, 8, 10, 10, 12, 12, 14 };

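// Horizontally filters two rows of packed 8-bit source pixels (one row per
// 128-bit lane): pixels are paired with the shuffle_src masks, multiplied
// against the prepacked 8-bit coefficients with maddubs, summed, rounded and
// shifted, and the 16-bit result is written to horz_out[row].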
static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
                                          __m256i *coeff,
                                          const __m256i *shuffle_src,
                                          const __m256i *round_const,
                                          const __m128i *shift, int row) {
  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);

  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);

  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
  const __m256i res =
      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
  horz_out[row] = _mm256_srl_epi16(res, *shift);
}

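// Loads the 8-bit horizontal filters for the eight output pixels of two
// adjacent source rows (x offsets sx and sx + beta) and transposes them into
// the coeff[0..3] layout expected by filter_src_pixels_avx2.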
static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
                                                        int sx,
                                                        __m256i *coeff) {
  __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);
  __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);
  __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);
  __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);

  __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);
  __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);
  __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);
  __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
                                  WARPEDDIFF_PREC_BITS]);

  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);

  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);

  __m128i tmp_8 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);

  __m128i tmp_9 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);

  __m128i tmp_10 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);

  __m128i tmp_11 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);

  tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);

  tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);

  tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);

  tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
                                  WARPEDDIFF_PREC_BITS]);
  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);

  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);

  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);

  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
}

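// beta == 0: both rows of a pair use the same eight filters, so they are
// loaded once and broadcast to both 128-bit lanes.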
static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
                                                              __m256i *coeff) {
  __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);

  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);

  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
}

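// alpha == 0: all eight pixels of a row share a single filter. Only the two
// per-row filters (sx and sx + beta) are loaded; their coefficient pairs are
// then replicated with the shuffle_alpha0 masks.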
static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
                                                               __m256i *coeff) {
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);

  const __m256i res_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);

  coeff[0] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
  coeff[1] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
  coeff[2] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
  coeff[3] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
}

static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
                                          int sx, int alpha, int beta, int row,
                                          const __m256i *shuffle_src,
                                          const __m256i *round_const,
                                          const __m128i *shift) {
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
                         row);
}
static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m256i *coeff) {
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
}

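// Horizontally filters the source rows needed by one 8-wide block: two rows
// per loop iteration (packed into one 256-bit register), with the final odd
// row handled separately through the 128-bit coefficient path.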
static INLINE void warp_horizontal_filter_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  int k, iy, sx, row = 0;
  __m256i coeff[4];
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    sx = sx4 + beta * (k + 4);
    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
                           round_const, shift);
    row += 1;
  }
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  sx = sx4 + beta * (k + 4);
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

static INLINE void warp_horizontal_filter_alpha0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)alpha;
  int k, iy, sx, row = 0;
  __m256i coeff[4];
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    sx = sx4 + beta * (k + 4);
    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  sx = sx4 + beta * (k + 4);
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

static INLINE void warp_horizontal_filter_beta0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)beta;
  int k, iy, row = 0;
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src_1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  (void)alpha;
  int k, iy, row = 0;
  __m256i coeff[4];
  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
    iy = iy4 + k;
    iy = clamp(iy, 0, height - 1);
    const __m128i src0 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    iy = iy4 + k + 1;
    iy = clamp(iy, 0, height - 1);
    const __m128i src1 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m256i src_01 =
        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                           shift, row);
    row += 1;
  }
  iy = iy4 + k;
  iy = clamp(iy, 0, height - 1);
  const __m256i src_01 = _mm256_castsi128_si256(
      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
                         shift, row);
}

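// Precomputes the constants used by the compound-averaging output path: the
// interleaved forward/backward distance weights plus the offset and rounding
// terms derived from the ConvolveParams rounding configuration.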
static INLINE void unpack_weights_and_set_round_const_avx2(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
  *res_sub_const =
      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                        (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi16((short)w0);
  const __m256i wt1 = _mm256_set1_epi16((short)w1);
  *wt = _mm256_unpacklo_epi16(wt0, wt1);
}

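// Loads the 16-bit vertical filters for the eight columns of two adjacent
// output rows (y offsets sy and sy + delta) and transposes them into the
// coeffs[0..7] layout consumed by filter_src_pixels_vertical_avx2.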
static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
                                                       int sy,
                                                       __m256i *coeffs) {
  __m128i filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m128i filt_10 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_11 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_12 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_13 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m256i filt_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  __m256i filt_1 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  __m256i filt_2 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  __m256i filt_3 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);

  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);

  filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_10 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_11 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_12 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_13 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter +
                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
  filt_1 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
  filt_2 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
  filt_3 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);

  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
}

static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
                                                              __m256i *coeffs) {
  __m128i filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  __m128i filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);

  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);

  filt_00 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_01 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_02 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  filt_03 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  filt_0 = _mm256_broadcastsi128_si256(filt_00);
  filt_1 = _mm256_broadcastsi128_si256(filt_01);
  filt_2 = _mm256_broadcastsi128_si256(filt_02);
  filt_3 = _mm256_broadcastsi128_si256(filt_03);

  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);

  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
}

static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
                                                              __m256i *coeffs) {
  const __m128i filt_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
  const __m128i filt_1 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));

  __m256i res_0 =
      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);

  coeffs[0] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
  coeffs[1] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
  coeffs[2] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
  coeffs[3] = _mm256_shuffle_epi8(
      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));

  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

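// Applies the 8-tap vertical filter to two rows of interleaved 16-bit
// horizontal outputs: even- and odd-indexed output columns are accumulated
// separately with madd, then interleaved back into column order, returning
// 32-bit sums for columns 0-3 (*res_lo) and 4-7 (*res_hi).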
static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
                                                   __m256i *src,
                                                   __m256i *coeffs,
                                                   __m256i *res_lo,
                                                   __m256i *res_hi, int row) {
  const __m256i src_6 = horz_out[row + 3];
  const __m256i src_7 =
      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);

  src[6] = _mm256_unpacklo_epi16(src_6, src_7);

  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);

  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
                                            _mm256_add_epi32(res_4, res_6));

  src[7] = _mm256_unpackhi_epi16(src_6, src_7);

  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);

  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
                                           _mm256_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
}

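// Rounds the 32-bit vertical filter sums and writes two output rows. In
// compound mode the result goes to the 16-bit conv_params->dst buffer, or is
// averaged with it (optionally distance-weighted) and packed to 8 bits into
// pred; otherwise it is packed to 8 bits and stored directly to pred.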
static INLINE void store_vertical_filter_output_avx2(
    const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
    const __m256i *wt, const __m256i *res_sub_const,
    const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
    int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m256i res_lo_1 = *res_lo;
  __m256i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p_0 =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    __m128i *const p_1 =
        (__m128i *)&conv_params
            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];

    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
                                 reduce_bits_vert);

    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
    __m256i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      __m128i *const dst8_1 =
          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
      const __m256i p_16 =
          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
      if (conv_params->use_dist_wtd_comp_avg) {
        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
        const __m256i shifted_32 =
            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
      }
      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
      res_lo_16 = _mm256_srai_epi16(
          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
      *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
      *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
    } else {
      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
      _mm_storel_epi64(p_0, temp_lo_16_0);
      _mm_storel_epi64(p_1, temp_lo_16_1);
    }
    if (p_width > 4) {
      __m128i *const p4_0 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      __m128i *const p4_1 =
          (__m128i *)&conv_params
              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
                                   reduce_bits_vert);
      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
      __m256i res_hi_16;
      if (conv_params->do_average) {
        __m128i *const dst8_4_0 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        __m128i *const dst8_4_1 =
            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
        const __m256i p4_16 = _mm256_inserti128_si256(
            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
        if (conv_params->use_dist_wtd_comp_avg) {
          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
          const __m256i shifted_32 =
              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
        res_hi_16 = _mm256_srai_epi16(
            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
        *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
        *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
      } else {
        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
        _mm_storel_epi64(p4_0, temp_hi_16_0);
        _mm_storel_epi64(p4_1, temp_hi_16_1);
      }
    }
  } else {
    const __m256i res_lo_round = _mm256_srai_epi32(
        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m256i res_hi_round = _mm256_srai_epi32(
        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];

    if (p_width == 4) {
      *(int *)p = _mm_cvtsi128_si32(res_8bit0);
      *(int *)p1 = _mm_cvtsi128_si32(res_8bit1);
    } else {
      _mm_storel_epi64(p, res_8bit0);
      _mm_storel_epi64(p1, res_8bit1);
    }
  }
}

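// Vertically filters one block, two output rows per iteration: coefficients
// are recomputed for each row pair and the six reusable interleaved source
// registers are rotated down after every store.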
static INLINE void warp_vertical_filter_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  int k, row = 0;
  __m256i src[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    int sy = sy4 + delta * (k + 4);
    __m256i coeffs[8];
    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];

    row += 1;
  }
}

static INLINE void warp_vertical_filter_gamma0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)gamma;
  int k, row = 0;
  __m256i src[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    int sy = sy4 + delta * (k + 4);
    __m256i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

static INLINE void warp_vertical_filter_delta0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)delta;
  int k, row = 0;
  __m256i src[8], coeffs[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  (void)gamma;
  int k, row = 0;
  __m256i src[8], coeffs[8];
  const __m256i src_0 = horz_out[0];
  const __m256i src_1 =
      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
  const __m256i src_2 = horz_out[1];
  const __m256i src_3 =
      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
  const __m256i src_4 = horz_out[2];
  const __m256i src_5 =
      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);

  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
  src[4] = _mm256_unpacklo_epi16(src_4, src_5);

  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
  src[5] = _mm256_unpackhi_epi16(src_4, src_5);

  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);

  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
    __m256i res_lo, res_hi;
    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
                                    row);
    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
                                      res_sub_const, round_bits_const, pred,
                                      conv_params, i, j, k, reduce_bits_vert,
                                      p_stride, p_width, round_bits);
    src[0] = src[2];
    src[2] = src[4];
    src[4] = src[6];
    src[1] = src[3];
    src[3] = src[5];
    src[5] = src[7];
    row += 1;
  }
}

static INLINE void prepare_warp_vertical_filter_avx2(
    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
    int i, int j, int sy4, const int reduce_bits_vert,
    const __m256i *res_add_const, const int round_bits,
    const __m256i *res_sub_const, const __m256i *round_bits_const,
    const __m256i *wt) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0_avx2(
        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
        round_bits_const, wt);
  else
    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
                              p_height, p_stride, p_width, i, j, sy4,
                              reduce_bits_vert, res_add_const, round_bits,
                              res_sub_const, round_bits_const, wt);
}

static INLINE void prepare_warp_horizontal_filter_avx2(
    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const __m256i *round_const, const __m128i *shift,
    const __m256i *shuffle_src) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0_avx2(
        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        round_const, shift, shuffle_src);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
                                       alpha, beta, p_height, height, i,
                                       round_const, shift, shuffle_src);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
                                      alpha, beta, p_height, height, i,
                                      round_const, shift, shuffle_src);
  else
    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
                                beta, p_height, height, i, round_const, shift,
                                shuffle_src);
}

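// 8-bit AVX2 warp. For each 8x8 block of the prediction, the warped source
// position is computed from the affine model, the required source rows are
// horizontally filtered (with cheap paths when the block lies entirely off
// the left/right edge and shuffle-based padding when it straddles one), and
// the result is vertically filtered and stored by the helpers above.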
void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
                          int height, int stride, uint8_t *pred, int p_col,
                          int p_row, int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y,
                          ConvolveParams *conv_params, int16_t alpha,
                          int16_t beta, int16_t gamma, int16_t delta) {
  __m256i horz_out[8];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m256i reduce_bits_vert_const =
      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const __m256i round_const = _mm256_set1_epi16(
      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);

  __m256i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
                                          &res_sub_const, &round_bits_const,
                                          &wt);

  __m256i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                        ((1 << reduce_bits_vert) >> 1));
  }
  const int32_t const1 = alpha * (-4) + beta * (-4) +
                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  const int32_t const2 = gamma * (-4) + delta * (-4) +
                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
  const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
  const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));

  __m256i shuffle_src[4];
  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += const1;
      sy4 += const2;

      sx4 &= ~const3;
      sy4 &= ~const3;

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.

      if (ix4 <= -7) {
        int iy, row = 0;
        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
          iy = iy4 + k;
          iy = clamp(iy, 0, height - 1);
          const __m256i temp_0 =
              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
          iy = iy4 + k + 1;
          iy = clamp(iy, 0, height - 1);
          const __m256i temp_1 =
              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
          row += 1;
        }
        iy = iy4 + k;
        iy = clamp(iy, 0, height - 1);
        horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
      } else if (ix4 >= width + 6) {
        int iy, row = 0;
        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
          iy = iy4 + k;
          iy = clamp(iy, 0, height - 1);
          const __m256i temp_0 = _mm256_set1_epi16(
              const4 + ref[iy * stride + (width - 1)] * const5);
          iy = iy4 + k + 1;
          iy = clamp(iy, 0, height - 1);
          const __m256i temp_1 = _mm256_set1_epi16(
              const4 + ref[iy * stride + (width - 1)] * const5);
          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
          row += 1;
        }
        iy = iy4 + k;
        iy = clamp(iy, 0, height - 1);
        horz_out[row] =
            _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        int iy, sx, row = 0;
        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
          iy = iy4 + k;
          iy = clamp(iy, 0, height - 1);
          __m128i src0 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          iy = iy4 + k + 1;
          iy = clamp(iy, 0, height - 1);
          __m128i src1 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
            src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
            src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
          }
          sx = sx4 + beta * (k + 4);
          const __m256i src_01 =
              _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
          horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
                                 shuffle_src, &round_const, &shift);
          row += 1;
        }
        iy = iy4 + k;
        iy = clamp(iy, 0, height - 1);
        __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
        if (out_of_boundary_left >= 0) {
          const __m128i shuffle_reg_left =
              _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
          src = _mm_shuffle_epi8(src, shuffle_reg_left);
        }
        if (out_of_boundary_right >= 0) {
          const __m128i shuffle_reg_right =
              _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
          src = _mm_shuffle_epi8(src, shuffle_reg_right);
        }
        sx = sx4 + beta * (k + 4);
        const __m256i src_01 = _mm256_castsi128_si256(src);
        __m256i coeff[4];
        prepare_horizontal_filter_coeff(alpha, sx, coeff);
        filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
                               &round_const, &shift, row);
      } else {
        prepare_warp_horizontal_filter_avx2(
            ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
            i, &round_const, &shift, shuffle_src);
      }

      // Vertical filter
      prepare_warp_vertical_filter_avx2(
          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
          p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
          &res_sub_const, &round_bits_const, &wt);
    }
  }
}