/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"

#include "aom_dsp/x86/synonyms.h"

#include "aom_ports/mem.h"

#include "./av1_rtcd.h"
#include "av1/common/filter.h"

typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);

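// Sum of squares of 256 int16_t samples (a 16x16 block), read as 32 groups
// of 8. _mm_madd_epi16(v, v) squares each 16-bit lane and adds adjacent
// pairs into 32-bit lanes, so squaring and widening happen in one step; the
// tail then folds the four 32-bit partial sums into a single scalar.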
unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = xx_loadu_128(src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

// Read 4 samples from each of row and row + 1. Interleave the two rows into
// the low 8 bytes and zero-extend them to eight 16-bit samples that fill the
// SSE register.
static __m128i read64(const uint8_t *p, int stride, int row) {
  __m128i row0 = xx_loadl_32(p + (row + 0) * stride);
  __m128i row1 = xx_loadl_32(p + (row + 1) * stride);
  return _mm_unpacklo_epi8(_mm_unpacklo_epi8(row0, row1), _mm_setzero_si128());
}

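// For reference, each getNxMvar kernel computes in SIMD what the following
// scalar sketch does for the 4x4 case (illustration only, not part of the
// library):
//
//   int sum = 0;
//   unsigned int sse = 0;
//   for (int i = 0; i < 4; i++) {
//     for (int j = 0; j < 4; j++) {
//       const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
//       sum += d;
//       sse += d * d;
//     }
//   }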
static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i src0 = read64(src, src_stride, 0);
  const __m128i src1 = read64(src, src_stride, 2);
  const __m128i ref0 = read64(ref, ref_stride, 0);
  const __m128i ref1 = read64(ref, ref_stride, 2);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 =
        _mm_unpacklo_epi8(xx_loadl_64(src + i * src_stride), zero);
    const __m128i ref0 =
        _mm_unpacklo_epi8(xx_loadl_64(ref + i * ref_stride), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 =
        _mm_unpacklo_epi8(xx_loadl_64(src + (i + 1) * src_stride), zero);
    const __m128i ref1 =
        _mm_unpacklo_epi8(xx_loadl_64(ref + (i + 1) * ref_stride), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

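// In the 16x16 kernel below, each of the eight 16-bit lanes of vsum
// accumulates 32 difference values (two per row over 16 rows), so lanes stay
// within int16_t range through two folds of the reduction. A third 16-bit
// fold would sum 256 differences, which can reach +/-65280 and overflow, so
// the last pair of lanes is instead extracted and added in 32-bit
// arithmetic.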
void aom_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = xx_loadu_128(src);
    const __m128i r = xx_loadu_128(ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum =
      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

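// Generic tiling helper: walks the w x h region in block_size steps,
// invoking var_fn on each tile and accumulating the partial SSE and sum so
// that the wrappers below only need a 4x4, 8x8 or 16x16 kernel.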
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride, int w,
                          int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

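// Each wrapper below turns the accumulated (sse, sum) into a variance:
//
//   variance = sse - (sum * sum) / (w * h)
//
// where dividing by the power-of-two block area is a right shift by
// log2(w * h). |sum| is at most 255 * w * h, so sum * sum only fits in a
// signed 32-bit int up to 128-pixel blocks; from 16x16 (256 pixels) upwards
// it can reach (255 * 256)^2 ~= 4.3e9 and the product is widened to int64_t
// first.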
unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  assert(sum <= 255 * 4 * 4);
  assert(sum >= -255 * 4 * 4);
  return *sse - ((sum * sum) >> 4);
}

unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
                get4x4var_sse2, 4);
  assert(sum <= 255 * 8 * 4);
  assert(sum >= -255 * 8 * 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
                get4x4var_sse2, 4);
  assert(sum <= 255 * 8 * 4);
  assert(sum >= -255 * 8 * 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  assert(sum <= 255 * 8 * 8);
  assert(sum >= -255 * 8 * 8);
  return *sse - ((sum * sum) >> 6);
}

unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
                aom_get8x8var_sse2, 8);
  assert(sum <= 255 * 16 * 8);
  assert(sum >= -255 * 16 * 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
                aom_get8x8var_sse2, 8);
  assert(sum <= 255 * 16 * 8);
  assert(sum >= -255 * 16 * 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  assert(sum <= 255 * 16 * 16);
  assert(sum >= -255 * 16 * 16);
  return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
}

unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 32 * 32);
  assert(sum >= -255 * 32 * 32);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 32 * 16);
  assert(sum >= -255 * 32 * 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 32 * 16);
  assert(sum >= -255 * 32 * 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 64 * 64);
  assert(sum >= -255 * 64 * 64);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 64 * 32);
  assert(sum >= -255 * 64 * 32);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 64 * 32);
  assert(sum >= -255 * 64 * 32);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

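// The MSE variants return the raw SSE: unlike variance, the mean term
// (sum * sum) / (w * h) is not subtracted.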
unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

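// Block sizes with 1:4 / 4:1 aspect ratios, only used with the extended
// partition types.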
#if CONFIG_EXT_PARTITION_TYPES
unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum,
                get4x4var_sse2, 4);
  assert(sum <= 255 * 4 * 16);
  assert(sum >= -255 * 4 * 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
}

unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum,
                get4x4var_sse2, 4);
  assert(sum <= 255 * 16 * 4);
  assert(sum >= -255 * 16 * 4);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
}

unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum,
                aom_get8x8var_sse2, 8);
  assert(sum <= 255 * 8 * 32);
  assert(sum >= -255 * 8 * 32);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
}

unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum,
                aom_get8x8var_sse2, 8);
  assert(sum <= 255 * 32 * 8);
  assert(sum >= -255 * 32 * 8);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
}

unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 16 * 64);
  assert(sum >= -255 * 16 * 64);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum,
                aom_get16x16var_sse2, 16);
  assert(sum <= 255 * 64 * 16);
  assert(sum >= -255 * 64 * 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
#endif

// The 2 unused parameters are placeholders for the PIC enabled build.
// These definitions are for functions defined in subpel_variance.asm.
#define DECL(w, opt) \
  int aom_sub_pixel_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
#define DECLS(opt) \
  DECL(4, opt); \
  DECL(8, opt); \
  DECL(16, opt)

DECLS(sse2);
DECLS(ssse3);
#undef DECLS
#undef DECL

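// The FN macro below assembles a full w x h sub-pixel variance from the
// fixed-width kernels declared above: blocks wider than the kernel width wf
// are processed as two or four 16-wide columns whose partial (se, sse)
// results are accumulated. 'cast' widens the se * se product where it could
// exceed 32 bits, and 'cast_prod' is the type in which that product is
// shifted by log2(w * h).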
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
    unsigned int sse; \
    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                  y_offset, dst, dst_stride, \
                                                  h, &sse, NULL, NULL); \
    if (w > wf) { \
      unsigned int sse2; \
      int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
          &sse2, NULL, NULL); \
      se += se2; \
      sse += sse2; \
      if (w > wf * 2) { \
        se2 = aom_sub_pixel_variance##wf##xh_##opt( \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL); \
        se += se2; \
        sse += sse2; \
        se2 = aom_sub_pixel_variance##wf##xh_##opt( \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL); \
        se += se2; \
        sse += sse2; \
      } \
    } \
    *sse_ptr = sse; \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }

#if CONFIG_EXT_PARTITION_TYPES
#define FNS(opt) \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
#else
#define FNS(opt) \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
#endif

FNS(sse2);
FNS(ssse3);

#undef FNS
#undef FN

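// The *_avg_* variants below take a second predictor 'sec': the assembly
// kernels average the interpolated block with it before differencing against
// dst, presumably for compound prediction where two references are blended.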
// The 2 unused parameters are placeholders for the PIC enabled build.
#define DECL(w, opt) \
  int aom_sub_pixel_avg_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
      void *unused)
#define DECLS(opt) \
  DECL(4, opt); \
  DECL(8, opt); \
  DECL(16, opt)

DECLS(sse2);
DECLS(ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr, \
      const uint8_t *sec) { \
    unsigned int sse; \
    int se = aom_sub_pixel_avg_variance##wf##xh_##opt( \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL); \
    if (w > wf) { \
      unsigned int sse2; \
      int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
          sec + 16, w, h, &sse2, NULL, NULL); \
      se += se2; \
      sse += sse2; \
      if (w > wf * 2) { \
        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
            sec + 32, w, h, &sse2, NULL, NULL); \
        se += se2; \
        sse += sse2; \
        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
            sec + 48, w, h, &sse2, NULL, NULL); \
        se += se2; \
        sse += sse2; \
      } \
    } \
    *sseptr = sse; \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }

#if CONFIG_EXT_PARTITION_TYPES
#define FNS(opt) \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
#else
#define FNS(opt) \
  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
#endif

FNS(sse2);
FNS(ssse3);

#undef FNS
#undef FN

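// Builds the (possibly sub-pel interpolated) prediction block for the
// encoder. A zero sub-pel offset is a straight copy, always written 16 bytes
// per store regardless of block width; otherwise the block is interpolated
// with the 8-tap EIGHTTAP_REGULAR filter: a single convolve pass for a
// purely horizontal or purely vertical offset, or two passes through an
// intermediate buffer for a diagonal one.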
void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
                             int subpel_x_q3, int subpel_y_q3,
                             const uint8_t *ref, int ref_stride) {
  if (!subpel_x_q3 && !subpel_y_q3) {
    if (width >= 16) {
      int i;
      assert(!(width & 15));
      /*Read 16 pixels one row at a time.*/
      for (i = 0; i < height; i++) {
        int j;
        for (j = 0; j < width; j += 16) {
          xx_storeu_128(comp_pred, xx_loadu_128(ref));
          comp_pred += 16;
          ref += 16;
        }
        ref += ref_stride - width;
      }
    } else if (width >= 8) {
      int i;
      assert(!(width & 7));
      assert(!(height & 1));
      /*Read 8 pixels two rows at a time.*/
      for (i = 0; i < height; i += 2) {
        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
        comp_pred += 16;
        ref += 2 * ref_stride;
      }
    } else {
      int i;
      assert(!(width & 3));
      assert(!(height & 3));
      /*Read 4 pixels four rows at a time.*/
      for (i = 0; i < height; i += 4) {
        const __m128i row0 = xx_loadl_32(ref + 0 * ref_stride);
        const __m128i row1 = xx_loadl_32(ref + 1 * ref_stride);
        const __m128i row2 = xx_loadl_32(ref + 2 * ref_stride);
        const __m128i row3 = xx_loadl_32(ref + 3 * ref_stride);
        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
                                               _mm_unpacklo_epi32(row2, row3));
        xx_storeu_128(comp_pred, reg);
        comp_pred += 16;
        ref += 4 * ref_stride;
      }
    }
  } else {
    InterpFilterParams filter;
    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
    if (!subpel_y_q3) {
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                          -1, width, height);
    } else if (!subpel_x_q3) {
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                         16, width, height);
    } else {
      DECLARE_ALIGNED(16, uint8_t,
                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
      const int16_t *kernel_x;
      const int16_t *kernel_y;
      int intermediate_height;
      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      intermediate_height =
          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
      aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1),
                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
                          width, intermediate_height);
      aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                         width, height);
    }
  }
}

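// Average the upsampled prediction with the second predictor 'pred', 16
// pixels at a time; _mm_avg_epu8 is the rounding byte average
// ((a + b + 1) >> 1).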
void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
                                      int width, int height, int subpel_x_q3,
                                      int subpel_y_q3, const uint8_t *ref,
                                      int ref_stride) {
  int n;
  int i;
  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);
  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
  assert(!(width * height & 15));
  n = width * height >> 4;
  for (i = 0; i < n; i++) {
    __m128i s0 = xx_loadu_128(comp_pred);
    __m128i p0 = xx_loadu_128(pred);
    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
    comp_pred += 16;
    pred += 16;
  }
}