/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

13#include <assert.h>
14#include <immintrin.h>
15
16#include "aom_dsp/x86/synonyms.h"
17
Yaowu Xuf883b422016-08-30 14:01:10 -070018#include "aom/aom_integer.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070019
20#include "av1/common/reconinter.h"
21
22#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
23
24/**
Yaowu Xuf883b422016-08-30 14:01:10 -070025 * See av1_wedge_sse_from_residuals_c
Yaowu Xuc27fc142016-08-22 16:08:15 -070026 */
Yaowu Xuf883b422016-08-30 14:01:10 -070027uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
28 const uint8_t *m, int N) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070029 int n = -N;
30 int n8 = n + 8;
31
32 uint64_t csse;
33
34 const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
Imdad Sardharwalla51232512018-04-30 14:41:28 +010035 const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
Yaowu Xuc27fc142016-08-22 16:08:15 -070036
37 __m128i v_acc0_q = _mm_setzero_si128();
38
39 assert(N % 64 == 0);
40
41 r1 += N;
42 d += N;
43 m += N;
44
45 do {
46 const __m128i v_r0_w = xx_load_128(r1 + n);
47 const __m128i v_r1_w = xx_load_128(r1 + n8);
48 const __m128i v_d0_w = xx_load_128(d + n);
49 const __m128i v_d1_w = xx_load_128(d + n8);
50 const __m128i v_m01_b = xx_load_128(m + n);
51
52 const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
53 const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
54 const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
55 const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
56 const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
57 const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
58
59 const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
60 const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
61 const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
62 const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
63
64 const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
65 const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
66 const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
67 const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
68
69 const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
70 const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
71
72 const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
73 const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
74
75 const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
76 _mm_srli_epi64(v_sq0_d, 32));
77 const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
78 _mm_srli_epi64(v_sq1_d, 32));
79
80 v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
81 v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
82
83 n8 += 16;
84 n += 16;
85 } while (n);
86
87 v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
88
89#if ARCH_X86_64
90 csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
91#else
92 xx_storel_64(&csse, v_acc0_q);
93#endif
94
95 return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
96}
97
98/**
Yaowu Xuf883b422016-08-30 14:01:10 -070099 * See av1_wedge_sign_from_residuals_c
Yaowu Xuc27fc142016-08-22 16:08:15 -0700100 */
Remyaf75b9842019-06-13 15:54:17 +0530101int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
102 int N, int64_t limit) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700103 int64_t acc;
104
105 __m128i v_sign_d;
106 __m128i v_acc0_d = _mm_setzero_si128();
107 __m128i v_acc1_d = _mm_setzero_si128();
108 __m128i v_acc_q;
109
110 // Input size limited to 8192 by the use of 32 bit accumulators and m
111 // being between [0, 64]. Overflow might happen at larger sizes,
112 // though it is practically impossible on real video input.
113 assert(N < 8192);
114 assert(N % 64 == 0);
115
116 do {
117 const __m128i v_m01_b = xx_load_128(m);
118 const __m128i v_m23_b = xx_load_128(m + 16);
119 const __m128i v_m45_b = xx_load_128(m + 32);
120 const __m128i v_m67_b = xx_load_128(m + 48);
121
122 const __m128i v_d0_w = xx_load_128(ds);
123 const __m128i v_d1_w = xx_load_128(ds + 8);
124 const __m128i v_d2_w = xx_load_128(ds + 16);
125 const __m128i v_d3_w = xx_load_128(ds + 24);
126 const __m128i v_d4_w = xx_load_128(ds + 32);
127 const __m128i v_d5_w = xx_load_128(ds + 40);
128 const __m128i v_d6_w = xx_load_128(ds + 48);
129 const __m128i v_d7_w = xx_load_128(ds + 56);
130
131 const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
132 const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
133 const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
134 const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
135 const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
136 const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
137 const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
138 const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
139
140 const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
141 const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
142 const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
143 const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
144 const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
145 const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
146 const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
147 const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
148
149 const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
150 const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
151 const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
152 const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
153
154 const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
155 const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
156
157 v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
158 v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
159
160 ds += 64;
161 m += 64;
162
163 N -= 64;
164 } while (N);
165
166 v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
167 v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
168 _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
169
170 v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
171 v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
172 _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
173
174 v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
175
176 v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
177
178#if ARCH_X86_64
179 acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
180#else
181 xx_storel_64(&acc, v_acc_q);
182#endif
183
184 return acc > limit;
185}
186
187// Negate under mask
188static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
189 return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
190}
191
192/**
Yaowu Xuf883b422016-08-30 14:01:10 -0700193 * av1_wedge_compute_delta_squares_c
Yaowu Xuc27fc142016-08-22 16:08:15 -0700194 */
Yaowu Xuf883b422016-08-30 14:01:10 -0700195void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
196 const int16_t *b, int N) {
Hien Ho710b12f2019-08-26 10:08:36 -0700197 const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
198 (short)0xffff, 0, (short)0xffff, 0);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700199
200 assert(N % 64 == 0);
201
202 do {
203 const __m128i v_a0_w = xx_load_128(a);
204 const __m128i v_b0_w = xx_load_128(b);
205 const __m128i v_a1_w = xx_load_128(a + 8);
206 const __m128i v_b1_w = xx_load_128(b + 8);
207 const __m128i v_a2_w = xx_load_128(a + 16);
208 const __m128i v_b2_w = xx_load_128(b + 16);
209 const __m128i v_a3_w = xx_load_128(a + 24);
210 const __m128i v_b3_w = xx_load_128(b + 24);
211
212 const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
213 const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
214 const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
215 const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
216 const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
217 const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
218 const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
219 const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
220
221 // Negate top word of pairs
222 const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
223 const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
224 const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
225 const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
226 const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
227 const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
228 const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
229 const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
230
231 const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
232 const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
233 const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
234 const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
235 const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
236 const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
237 const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
238 const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
239
240 const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
241 const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
242 const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
243 const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
244
245 xx_store_128(d, v_r0_w);
246 xx_store_128(d + 8, v_r1_w);
247 xx_store_128(d + 16, v_r2_w);
248 xx_store_128(d + 24, v_r3_w);
249
250 a += 32;
251 b += 32;
252 d += 32;
253 N -= 32;
254 } while (N);
255}