/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_ports/mem.h"

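// One dimension of an 8x8 Hadamard transform: applies a three-stage 8-point
// butterfly to in[0..7], operating on eight 16-bit columns in parallel.
// When iter == 0 the result is also transposed (via 16-/32-/64-bit unpacks)
// so that a second call with iter == 1 can transform the other dimension.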
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

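  // Stage 1 butterfly: sums and differences of adjacent input rows.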
  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

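  // Stage 2 butterfly: combine the stage-1 results two rows apart.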
  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

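  // Stage 3 butterfly: the outputs are written in a permuted row order. On
  // the first pass (iter == 0) the 8x8 result is additionally transposed; on
  // the second pass it is written straight back to in[].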
  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

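// 8x8 Hadamard transform of a block of residual samples. Aligned loads are
// used below, so each row of src_diff must be 16-byte aligned (src_diff
// aligned and src_stride a multiple of 8 int16_t elements). With
// is_final != 0 the result is written as tran_low_t via store_tran_low();
// with is_final == 0 it is written as raw int16_t into an intermediate
// buffer for a later combining stage.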
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

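  // First pass transforms the columns and transposes the block; second pass
  // transforms the rows of the transposed data.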
  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

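// Usage sketch (illustrative only, not part of the library): a caller such
// as the encoder's RD search forward-transforms an 8x8 residual block and
// then measures its energy with aom_satd_sse2() below. The buffer names are
// hypothetical; the residual buffer must satisfy the alignment noted above.
//
//   DECLARE_ALIGNED(16, int16_t, diff[8 * 8]);      // residual samples
//   DECLARE_ALIGNED(16, tran_low_t, coeff[8 * 8]);  // transform output
//   // ... fill diff with source minus prediction ...
//   aom_hadamard_8x8_sse2(diff, 8, coeff);
//   const int satd = aom_satd_sse2(coeff, 8 * 8);

// 16x16 Hadamard: run the 8x8 transform on each of the four 8x8 quadrants
// into an intermediate int16_t buffer, then combine the four quadrant
// outputs with one more butterfly stage, pre-scaling by >> 1 so the sums
// stay within 16-bit range.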
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

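  // Combine the four 8x8 quadrant results: each iteration processes eight
  // coefficients from every quadrant (quadrants lie 64 int16_t apart in
  // t_coeff).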
  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}

void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

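// 32x32 Hadamard: same construction as the 16x16 case, but built from four
// 16x16 quadrants and with a >> 2 pre-scale in the combining stage to keep
// the wider sums within 16-bit range.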
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

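  // Combine the four 16x16 quadrant results (quadrants lie 256 int16_t apart
  // in t_coeff).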
  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}

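// Sum of absolute values of `length` transform coefficients, processed eight
// at a time: load_tran_low() packs the coefficients to int16_t, |x| is
// computed as max(x, -x), the values are zero-extended to 32 bits and
// accumulated, and the vector accumulator is reduced horizontally at the end.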
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}