Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | /* |
Yaowu Xu | 2ab7ff0 | 2016-09-02 12:04:54 -0700 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | * |
Yaowu Xu | 2ab7ff0 | 2016-09-02 12:04:54 -0700 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 12 | #include <assert.h> |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 13 | #include <emmintrin.h> // SSE2 |
| 14 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 15 | #include "./aom_config.h" |
| 16 | #include "./aom_dsp_rtcd.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 17 | |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 18 | #include "aom_dsp/x86/synonyms.h" |
| 19 | |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 20 | #include "aom_ports/mem.h" |
| 21 | |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 22 | #include "./av1_rtcd.h" |
| 23 | #include "av1/common/filter.h" |
| 24 | |
// Common signature of the fixed-size block kernels in this file. A kernel
// compares one block of `src` against the co-located block of `ref` and
// writes the sum of squared differences to *sse and the signed sum of
// differences to *sum.
typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);
| 28 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 29 | unsigned int aom_get_mb_ss_sse2(const int16_t *src) { |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 30 | __m128i vsum = _mm_setzero_si128(); |
| 31 | int i; |
| 32 | |
| 33 | for (i = 0; i < 32; ++i) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 34 | const __m128i v = xx_loadu_128(src); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 35 | vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); |
| 36 | src += 8; |
| 37 | } |
| 38 | |
| 39 | vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); |
| 40 | vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); |
| 41 | return _mm_cvtsi128_si32(vsum); |
| 42 | } |
| 43 | |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 44 | // Read 4 samples from each of row and row + 1. Interleave the two rows and |
| 45 | // zero-extend them to 16 bit samples stored in the lower half of an SSE |
| 46 | // register. |
| 47 | static __m128i read64(const uint8_t *p, int stride, int row) { |
| 48 | __m128i row0 = xx_loadl_32(p + (row + 0) * stride); |
| 49 | __m128i row1 = xx_loadl_32(p + (row + 1) * stride); |
| 50 | return _mm_unpacklo_epi8(_mm_unpacklo_epi8(row0, row1), _mm_setzero_si128()); |
| 51 | } |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 52 | |
| 53 | static void get4x4var_sse2(const uint8_t *src, int src_stride, |
| 54 | const uint8_t *ref, int ref_stride, |
| 55 | unsigned int *sse, int *sum) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 56 | const __m128i src0 = read64(src, src_stride, 0); |
| 57 | const __m128i src1 = read64(src, src_stride, 2); |
| 58 | const __m128i ref0 = read64(ref, ref_stride, 0); |
| 59 | const __m128i ref1 = read64(ref, ref_stride, 2); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 60 | const __m128i diff0 = _mm_sub_epi16(src0, ref0); |
| 61 | const __m128i diff1 = _mm_sub_epi16(src1, ref1); |
| 62 | |
| 63 | // sum |
| 64 | __m128i vsum = _mm_add_epi16(diff0, diff1); |
| 65 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); |
| 66 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); |
| 67 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); |
| 68 | *sum = (int16_t)_mm_extract_epi16(vsum, 0); |
| 69 | |
| 70 | // sse |
| 71 | vsum = |
| 72 | _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); |
| 73 | vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); |
| 74 | vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); |
| 75 | *sse = _mm_cvtsi128_si32(vsum); |
| 76 | } |
| 77 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 78 | void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 79 | int ref_stride, unsigned int *sse, int *sum) { |
| 80 | const __m128i zero = _mm_setzero_si128(); |
| 81 | __m128i vsum = _mm_setzero_si128(); |
| 82 | __m128i vsse = _mm_setzero_si128(); |
| 83 | int i; |
| 84 | |
| 85 | for (i = 0; i < 8; i += 2) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 86 | const __m128i src0 = |
| 87 | _mm_unpacklo_epi8(xx_loadl_64(src + i * src_stride), zero); |
| 88 | const __m128i ref0 = |
| 89 | _mm_unpacklo_epi8(xx_loadl_64(ref + i * ref_stride), zero); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 90 | const __m128i diff0 = _mm_sub_epi16(src0, ref0); |
| 91 | |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 92 | const __m128i src1 = |
| 93 | _mm_unpacklo_epi8(xx_loadl_64(src + (i + 1) * src_stride), zero); |
| 94 | const __m128i ref1 = |
| 95 | _mm_unpacklo_epi8(xx_loadl_64(ref + (i + 1) * ref_stride), zero); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 96 | const __m128i diff1 = _mm_sub_epi16(src1, ref1); |
| 97 | |
| 98 | vsum = _mm_add_epi16(vsum, diff0); |
| 99 | vsum = _mm_add_epi16(vsum, diff1); |
| 100 | vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); |
| 101 | vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); |
| 102 | } |
| 103 | |
| 104 | // sum |
| 105 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); |
| 106 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); |
| 107 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); |
| 108 | *sum = (int16_t)_mm_extract_epi16(vsum, 0); |
| 109 | |
| 110 | // sse |
| 111 | vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); |
| 112 | vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); |
| 113 | *sse = _mm_cvtsi128_si32(vsse); |
| 114 | } |
| 115 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 116 | void aom_get16x16var_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 117 | const uint8_t *ref, int ref_stride, unsigned int *sse, |
| 118 | int *sum) { |
| 119 | const __m128i zero = _mm_setzero_si128(); |
| 120 | __m128i vsum = _mm_setzero_si128(); |
| 121 | __m128i vsse = _mm_setzero_si128(); |
| 122 | int i; |
| 123 | |
| 124 | for (i = 0; i < 16; ++i) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 125 | const __m128i s = xx_loadu_128(src); |
| 126 | const __m128i r = xx_loadu_128(ref); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 127 | |
| 128 | const __m128i src0 = _mm_unpacklo_epi8(s, zero); |
| 129 | const __m128i ref0 = _mm_unpacklo_epi8(r, zero); |
| 130 | const __m128i diff0 = _mm_sub_epi16(src0, ref0); |
| 131 | |
| 132 | const __m128i src1 = _mm_unpackhi_epi8(s, zero); |
| 133 | const __m128i ref1 = _mm_unpackhi_epi8(r, zero); |
| 134 | const __m128i diff1 = _mm_sub_epi16(src1, ref1); |
| 135 | |
| 136 | vsum = _mm_add_epi16(vsum, diff0); |
| 137 | vsum = _mm_add_epi16(vsum, diff1); |
| 138 | vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); |
| 139 | vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); |
| 140 | |
| 141 | src += src_stride; |
| 142 | ref += ref_stride; |
| 143 | } |
| 144 | |
| 145 | // sum |
| 146 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); |
| 147 | vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); |
| 148 | *sum = |
| 149 | (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); |
| 150 | |
| 151 | // sse |
| 152 | vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); |
| 153 | vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); |
| 154 | *sse = _mm_cvtsi128_si32(vsse); |
| 155 | } |
| 156 | |
| 157 | static void variance_sse2(const unsigned char *src, int src_stride, |
| 158 | const unsigned char *ref, int ref_stride, int w, |
| 159 | int h, unsigned int *sse, int *sum, |
| 160 | getNxMvar_fn_t var_fn, int block_size) { |
| 161 | int i, j; |
| 162 | |
| 163 | *sse = 0; |
| 164 | *sum = 0; |
| 165 | |
| 166 | for (i = 0; i < h; i += block_size) { |
| 167 | for (j = 0; j < w; j += block_size) { |
| 168 | unsigned int sse0; |
| 169 | int sum0; |
| 170 | var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, |
| 171 | ref_stride, &sse0, &sum0); |
| 172 | *sse += sse0; |
| 173 | *sum += sum0; |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | |
// 4x4 variance: stores the sum of squared differences in *sse and returns
// *sse - (sum * sum) / 16, where sum is the signed sum of differences.
unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  int sum_sq;

  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  assert(sum <= 255 * 4 * 4);
  assert(sum >= -255 * 4 * 4);

  // Within the asserted range the square fits in an int.
  sum_sq = sum * sum;
  return *sse - (sum_sq >> 4);
}
| 187 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 188 | unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 189 | const uint8_t *ref, int ref_stride, |
| 190 | unsigned int *sse) { |
| 191 | int sum; |
| 192 | variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, |
| 193 | get4x4var_sse2, 4); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 194 | assert(sum <= 255 * 8 * 4); |
| 195 | assert(sum >= -255 * 8 * 4); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 196 | return *sse - ((sum * sum) >> 5); |
| 197 | } |
| 198 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 199 | unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 200 | const uint8_t *ref, int ref_stride, |
| 201 | unsigned int *sse) { |
| 202 | int sum; |
| 203 | variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, |
| 204 | get4x4var_sse2, 4); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 205 | assert(sum <= 255 * 8 * 4); |
| 206 | assert(sum >= -255 * 8 * 4); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 207 | return *sse - ((sum * sum) >> 5); |
| 208 | } |
| 209 | |
// 8x8 variance: stores the sum of squared differences in *sse and returns
// *sse - (sum * sum) / 64.
unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  int sum_sq;

  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  assert(sum <= 255 * 8 * 8);
  assert(sum >= -255 * 8 * 8);

  // Within the asserted range the square fits in an int.
  sum_sq = sum * sum;
  return *sse - (sum_sq >> 6);
}
| 219 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 220 | unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 221 | const unsigned char *ref, int ref_stride, |
| 222 | unsigned int *sse) { |
| 223 | int sum; |
| 224 | variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 225 | aom_get8x8var_sse2, 8); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 226 | assert(sum <= 255 * 16 * 8); |
| 227 | assert(sum >= -255 * 16 * 8); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 228 | return *sse - ((sum * sum) >> 7); |
| 229 | } |
| 230 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 231 | unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 232 | const unsigned char *ref, int ref_stride, |
| 233 | unsigned int *sse) { |
| 234 | int sum; |
| 235 | variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 236 | aom_get8x8var_sse2, 8); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 237 | assert(sum <= 255 * 16 * 8); |
| 238 | assert(sum >= -255 * 16 * 8); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 239 | return *sse - ((sum * sum) >> 7); |
| 240 | } |
| 241 | |
// 16x16 variance: stores the sum of squared differences in *sse and returns
// *sse - (sum * sum) / 256.
unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  uint32_t sum_sq;

  aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  assert(sum <= 255 * 16 * 16);
  assert(sum >= -255 * 16 * 16);

  // The square can exceed INT_MAX, so it is formed in 64 bits and then
  // narrowed to uint32_t before the shift.
  sum_sq = (uint32_t)((int64_t)sum * sum);
  return *sse - (sum_sq >> 8);
}
| 251 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 252 | unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 253 | const uint8_t *ref, int ref_stride, |
| 254 | unsigned int *sse) { |
| 255 | int sum; |
| 256 | variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 257 | aom_get16x16var_sse2, 16); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 258 | assert(sum <= 255 * 32 * 32); |
| 259 | assert(sum >= -255 * 32 * 32); |
Alex Converse | 2176b7a | 2016-07-28 09:48:50 -0700 | [diff] [blame] | 260 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 261 | } |
| 262 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 263 | unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 264 | const uint8_t *ref, int ref_stride, |
| 265 | unsigned int *sse) { |
| 266 | int sum; |
| 267 | variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 268 | aom_get16x16var_sse2, 16); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 269 | assert(sum <= 255 * 32 * 16); |
| 270 | assert(sum >= -255 * 32 * 16); |
Alex Converse | 2176b7a | 2016-07-28 09:48:50 -0700 | [diff] [blame] | 271 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 272 | } |
| 273 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 274 | unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 275 | const uint8_t *ref, int ref_stride, |
| 276 | unsigned int *sse) { |
| 277 | int sum; |
| 278 | variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 279 | aom_get16x16var_sse2, 16); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 280 | assert(sum <= 255 * 32 * 16); |
| 281 | assert(sum >= -255 * 32 * 16); |
Alex Converse | 2176b7a | 2016-07-28 09:48:50 -0700 | [diff] [blame] | 282 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 283 | } |
| 284 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 285 | unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 286 | const uint8_t *ref, int ref_stride, |
| 287 | unsigned int *sse) { |
| 288 | int sum; |
| 289 | variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 290 | aom_get16x16var_sse2, 16); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 291 | assert(sum <= 255 * 64 * 64); |
| 292 | assert(sum >= -255 * 64 * 64); |
Alex Converse | 2176b7a | 2016-07-28 09:48:50 -0700 | [diff] [blame] | 293 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 294 | } |
| 295 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 296 | unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 297 | const uint8_t *ref, int ref_stride, |
| 298 | unsigned int *sse) { |
| 299 | int sum; |
| 300 | variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 301 | aom_get16x16var_sse2, 16); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 302 | assert(sum <= 255 * 64 * 32); |
| 303 | assert(sum >= -255 * 64 * 32); |
Alex Converse | 2176b7a | 2016-07-28 09:48:50 -0700 | [diff] [blame] | 304 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 305 | } |
| 306 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 307 | unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 308 | const uint8_t *ref, int ref_stride, |
| 309 | unsigned int *sse) { |
| 310 | int sum; |
| 311 | variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 312 | aom_get16x16var_sse2, 16); |
Yaowu Xu | 6feda06 | 2016-05-18 09:41:09 -0700 | [diff] [blame] | 313 | assert(sum <= 255 * 64 * 32); |
| 314 | assert(sum >= -255 * 64 * 32); |
Alex Converse | 2176b7a | 2016-07-28 09:48:50 -0700 | [diff] [blame] | 315 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 316 | } |
| 317 | |
// Returns the 8x8 sum of squared differences and also stores it in *sse.
unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  // Only the *sse side output is wanted; the variance itself is discarded.
  (void)aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
| 324 | |
// Returns the 8x16 sum of squared differences and also stores it in *sse.
unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  // Only the *sse side output is wanted; the variance itself is discarded.
  (void)aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
| 331 | |
// Returns the 16x8 sum of squared differences and also stores it in *sse.
unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  // Only the *sse side output is wanted; the variance itself is discarded.
  (void)aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
| 338 | |
// Returns the 16x16 sum of squared differences and also stores it in *sse.
unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  // Only the *sse side output is wanted; the variance itself is discarded.
  (void)aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
| 345 | |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 346 | #if CONFIG_EXT_PARTITION_TYPES |
| 347 | unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride, |
| 348 | const uint8_t *ref, int ref_stride, |
| 349 | unsigned int *sse) { |
| 350 | int sum; |
| 351 | variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum, |
| 352 | get4x4var_sse2, 4); |
| 353 | assert(sum <= 255 * 4 * 16); |
| 354 | assert(sum >= -255 * 4 * 16); |
| 355 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); |
| 356 | } |
| 357 | |
| 358 | unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride, |
| 359 | const uint8_t *ref, int ref_stride, |
| 360 | unsigned int *sse) { |
| 361 | int sum; |
| 362 | variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum, |
| 363 | get4x4var_sse2, 4); |
| 364 | assert(sum <= 255 * 16 * 4); |
| 365 | assert(sum >= -255 * 16 * 4); |
| 366 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); |
| 367 | } |
| 368 | |
| 369 | unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride, |
| 370 | const uint8_t *ref, int ref_stride, |
| 371 | unsigned int *sse) { |
| 372 | int sum; |
| 373 | variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum, |
| 374 | aom_get8x8var_sse2, 8); |
| 375 | assert(sum <= 255 * 8 * 32); |
| 376 | assert(sum >= -255 * 8 * 32); |
| 377 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); |
| 378 | } |
| 379 | |
| 380 | unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, |
| 381 | const uint8_t *ref, int ref_stride, |
| 382 | unsigned int *sse) { |
| 383 | int sum; |
| 384 | variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum, |
| 385 | aom_get8x8var_sse2, 8); |
| 386 | assert(sum <= 255 * 32 * 8); |
| 387 | assert(sum >= -255 * 32 * 8); |
| 388 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); |
| 389 | } |
Rupert Swarbrick | 7267857 | 2017-08-02 12:05:26 +0100 | [diff] [blame] | 390 | |
| 391 | unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride, |
| 392 | const uint8_t *ref, int ref_stride, |
| 393 | unsigned int *sse) { |
| 394 | int sum; |
| 395 | variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum, |
| 396 | aom_get16x16var_sse2, 16); |
| 397 | assert(sum <= 255 * 16 * 64); |
| 398 | assert(sum >= -255 * 16 * 64); |
| 399 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); |
| 400 | } |
| 401 | |
| 402 | unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride, |
| 403 | const uint8_t *ref, int ref_stride, |
| 404 | unsigned int *sse) { |
| 405 | int sum; |
| 406 | variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum, |
| 407 | aom_get16x16var_sse2, 16); |
| 408 | assert(sum <= 255 * 64 * 16); |
| 409 | assert(sum >= -255 * 64 * 16); |
| 410 | return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); |
| 411 | } |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 412 | #endif |
| 413 | |
// Prototypes for the width-specialised sub-pixel variance kernels defined in
// subpel_variance.asm, for widths 4, 8 and 16 in SSE2 and SSSE3 flavours.
// The two trailing pointer parameters are unused place holders for PIC
// enabled builds.
#define DECL(w, opt)                                                        \
  int aom_sub_pixel_variance##w##xh_##opt(                                  \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused)

DECL(4, sse2);
DECL(8, sse2);
DECL(16, sse2);
DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
| 430 | |
// Generates aom_sub_pixel_variance<w>x<h>_<opt>(), a w x h wrapper around the
// wf-wide kernel aom_sub_pixel_variance<wf>xh_<opt>().
//
//   w, h          block dimensions of the generated function
//   wf            width handled by one kernel call; w must be a multiple of wf
//   wlog2, hlog2  log2 of w and h, used to divide se * se by w * h
//   cast          cast applied to se before squaring (widens the product)
//   cast_prod     cast applied to the product before the right shift
//
// The block is processed as w / wf column strips. Each kernel call returns
// the strip's sum of errors and writes its sum of squared errors; both are
// accumulated. The total SSE is stored in *sse_ptr and the function returns
// sse - (se * se >> (wlog2 + hlog2)).
//
// Strip offsets are derived from wf rather than hard-coded as 16/32/48, so
// the wrapper stays correct for any wf that divides w.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                      \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {           \
    unsigned int sse = 0;                                                    \
    int se = 0;                                                              \
    int col;                                                                 \
    /* One kernel call per wf-wide column strip. */                          \
    for (col = 0; col < (w); col += (wf)) {                                  \
      unsigned int sse2;                                                     \
      const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                  \
          src + col, src_stride, x_offset, y_offset, dst + col, dst_stride,  \
          h, &sse2, NULL, NULL);                                             \
      se += se2;                                                             \
      sse += sse2;                                                           \
    }                                                                        \
    *sse_ptr = sse;                                                          \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }
| 462 | |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 463 | #if CONFIG_EXT_PARTITION_TYPES |
| 464 | #define FNS(opt) \ |
| 465 | FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ |
| 466 | FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ |
| 467 | FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ |
| 468 | FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ |
| 469 | FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ |
| 470 | FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ |
| 471 | FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ |
| 472 | FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ |
| 473 | FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ |
| 474 | FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ |
| 475 | FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ |
| 476 | FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ |
| 477 | FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ |
| 478 | FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ |
| 479 | FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ |
Yaowu Xu | 9f78e85 | 2017-10-18 09:22:56 -0700 | [diff] [blame] | 480 | FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ |
| 481 | FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ |
| 482 | FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ |
| 483 | FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 484 | #else |
David Barker | 6c4af6b | 2017-06-23 17:14:51 +0100 | [diff] [blame] | 485 | #define FNS(opt) \ |
| 486 | FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ |
| 487 | FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ |
| 488 | FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ |
| 489 | FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ |
| 490 | FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ |
| 491 | FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ |
| 492 | FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ |
| 493 | FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ |
| 494 | FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ |
| 495 | FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ |
| 496 | FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ |
| 497 | FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ |
| 498 | FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 499 | #endif |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 500 | |
David Barker | 6c4af6b | 2017-06-23 17:14:51 +0100 | [diff] [blame] | 501 | FNS(sse2); |
| 502 | FNS(ssse3); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 503 | |
| 504 | #undef FNS |
| 505 | #undef FN |
| 506 | |
// Prototypes for the width-specialised sub-pixel averaging variance kernels
// (widths 4, 8 and 16, SSE2 and SSSE3 flavours). The two trailing pointer
// parameters are unused place holders for PIC enabled builds.
#define DECL(w, opt)                                                        \
  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)

DECL(4, sse2);
DECL(8, sse2);
DECL(16, sse2);
DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
| 523 | |
// Generates aom_sub_pixel_avg_variance<w>x<h>_<opt>(), a w x h wrapper around
// the wf-wide kernel aom_sub_pixel_avg_variance<wf>xh_<opt>().
//
//   w, h          block dimensions of the generated function
//   wf            width handled by one kernel call; w must be a multiple of wf
//   wlog2, hlog2  log2 of w and h, used to divide se * se by w * h
//   cast          cast applied to se before squaring (widens the product)
//   cast_prod     cast applied to the product before the right shift
//
// The block is processed as w / wf column strips. `sec` is passed to the
// kernel with a stride of w and is advanced by the same column offset as src
// and dst. Each kernel call returns the strip's sum of errors and writes its
// sum of squared errors; both are accumulated. The total SSE is stored in
// *sseptr and the function returns sse - (se * se >> (wlog2 + hlog2)).
//
// Strip offsets are derived from wf rather than hard-coded as 16/32/48, so
// the wrapper stays correct for any wf that divides w.
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr,              \
      const uint8_t *sec) {                                                  \
    unsigned int sse = 0;                                                    \
    int se = 0;                                                              \
    int col;                                                                 \
    /* One kernel call per wf-wide column strip. */                          \
    for (col = 0; col < (w); col += (wf)) {                                  \
      unsigned int sse2;                                                     \
      const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(              \
          src + col, src_stride, x_offset, y_offset, dst + col, dst_stride,  \
          sec + col, w, h, &sse2, NULL, NULL);                               \
      se += se2;                                                             \
      sse += sse2;                                                           \
    }                                                                        \
    *sseptr = sse;                                                           \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }
| 556 | |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 557 | #if CONFIG_EXT_PARTITION_TYPES |
| 558 | #define FNS(opt) \ |
| 559 | FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ |
| 560 | FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ |
| 561 | FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ |
| 562 | FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ |
| 563 | FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ |
| 564 | FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ |
| 565 | FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ |
| 566 | FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ |
| 567 | FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ |
| 568 | FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ |
| 569 | FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ |
| 570 | FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ |
| 571 | FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ |
| 572 | FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ |
| 573 | FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ |
Yaowu Xu | 9f78e85 | 2017-10-18 09:22:56 -0700 | [diff] [blame] | 574 | FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ |
| 575 | FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ |
| 576 | FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ |
| 577 | FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 578 | #else |
David Barker | 6c4af6b | 2017-06-23 17:14:51 +0100 | [diff] [blame] | 579 | #define FNS(opt) \ |
| 580 | FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ |
| 581 | FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ |
| 582 | FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ |
| 583 | FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ |
| 584 | FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ |
| 585 | FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ |
| 586 | FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ |
| 587 | FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ |
| 588 | FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ |
| 589 | FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ |
| 590 | FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ |
| 591 | FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ |
| 592 | FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) |
Rupert Swarbrick | 93c39e9 | 2017-07-12 11:11:02 +0100 | [diff] [blame] | 593 | #endif |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 594 | |
David Barker | 6c4af6b | 2017-06-23 17:14:51 +0100 | [diff] [blame] | 595 | FNS(sse2); |
| 596 | FNS(ssse3); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 597 | |
| 598 | #undef FNS |
| 599 | #undef FN |
| 600 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 601 | void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 602 | int subpel_x_q3, int subpel_y_q3, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 603 | const uint8_t *ref, int ref_stride) { |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 604 | if (!subpel_x_q3 && !subpel_y_q3) { |
| 605 | if (width >= 16) { |
| 606 | int i; |
| 607 | assert(!(width & 15)); |
| 608 | /*Read 16 pixels one row at a time.*/ |
| 609 | for (i = 0; i < height; i++) { |
| 610 | int j; |
| 611 | for (j = 0; j < width; j += 16) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 612 | xx_storeu_128(comp_pred, xx_loadu_128(ref)); |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 613 | comp_pred += 16; |
| 614 | ref += 16; |
| 615 | } |
| 616 | ref += ref_stride - width; |
| 617 | } |
| 618 | } else if (width >= 8) { |
| 619 | int i; |
| 620 | assert(!(width & 7)); |
| 621 | assert(!(height & 1)); |
| 622 | /*Read 8 pixels two rows at a time.*/ |
| 623 | for (i = 0; i < height; i += 2) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 624 | __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); |
| 625 | __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); |
| 626 | xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 627 | comp_pred += 16; |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 628 | ref += 2 * ref_stride; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 629 | } |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 630 | } else { |
| 631 | int i; |
| 632 | assert(!(width & 3)); |
| 633 | assert(!(height & 3)); |
| 634 | /*Read 4 pixels four rows at a time.*/ |
| 635 | for (i = 0; i < height; i++) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 636 | const __m128i row0 = xx_loadl_32(ref + 0 * ref_stride); |
| 637 | const __m128i row1 = xx_loadl_32(ref + 1 * ref_stride); |
| 638 | const __m128i row2 = xx_loadl_32(ref + 2 * ref_stride); |
| 639 | const __m128i row3 = xx_loadl_32(ref + 3 * ref_stride); |
| 640 | const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), |
| 641 | _mm_unpacklo_epi32(row2, row3)); |
| 642 | xx_storeu_128(comp_pred, reg); |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 643 | comp_pred += 16; |
| 644 | ref += 4 * ref_stride; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 645 | } |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 646 | } |
| 647 | } else { |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 648 | InterpFilterParams filter; |
| 649 | filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); |
| 650 | if (!subpel_y_q3) { |
| 651 | const int16_t *kernel; |
| 652 | kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); |
| 653 | aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, |
| 654 | -1, width, height); |
| 655 | } else if (!subpel_x_q3) { |
| 656 | const int16_t *kernel; |
| 657 | kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); |
| 658 | aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, |
| 659 | 16, width, height); |
| 660 | } else { |
| 661 | DECLARE_ALIGNED(16, uint8_t, |
| 662 | temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); |
| 663 | const int16_t *kernel_x; |
| 664 | const int16_t *kernel_y; |
| 665 | int intermediate_height; |
| 666 | kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); |
| 667 | kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); |
| 668 | intermediate_height = |
| 669 | (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; |
| 670 | assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); |
| 671 | aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), |
| 672 | ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, |
| 673 | width, intermediate_height); |
| 674 | aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), |
| 675 | MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, |
| 676 | width, height); |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 677 | } |
| 678 | } |
| 679 | } |
| 680 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 681 | void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 682 | int width, int height, int subpel_x_q3, |
| 683 | int subpel_y_q3, const uint8_t *ref, |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 684 | int ref_stride) { |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 685 | int n; |
| 686 | int i; |
| 687 | aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, |
| 688 | ref_stride); |
| 689 | /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ |
| 690 | assert(!(width * height & 15)); |
| 691 | n = width * height >> 4; |
| 692 | for (i = 0; i < n; i++) { |
Rupert Swarbrick | d2dea66 | 2017-10-24 17:23:21 +0100 | [diff] [blame] | 693 | __m128i s0 = xx_loadu_128(comp_pred); |
| 694 | __m128i p0 = xx_loadu_128(pred); |
| 695 | xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); |
Timothy B. Terriberry | 5d24b6f | 2017-06-15 13:39:35 -0700 | [diff] [blame] | 696 | comp_pred += 16; |
| 697 | pred += 16; |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 698 | } |
| 699 | } |