/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
| 10 | |
#include <stdlib.h>
#include <string.h>

#include <emmintrin.h>
#include <tmmintrin.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
| 18 | |
| 19 | static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { |
| 20 | __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr); |
| 21 | __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride)); |
| 22 | return _mm_unpacklo_epi64(temp1, temp2); |
| 23 | } |
| 24 | |
| 25 | static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { |
| 26 | __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); |
| 27 | __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride)); |
| 28 | __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); |
| 29 | temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2)); |
| 30 | temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3)); |
| 31 | temp1 = _mm_unpacklo_epi32(temp1, temp2); |
| 32 | return _mm_unpacklo_epi64(temp3, temp1); |
| 33 | } |
| 34 | |
// Forward declarations of the shared kernels. The macro-generated public
// vpx_masked_sad<w>x<h>_ssse3() entry points below forward to these; the
// definitions follow the macro expansions.
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride,
                                               int height);

static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
                                               int a_stride,
                                               const uint8_t *b_ptr,
                                               int b_stride,
                                               const uint8_t *m_ptr,
                                               int m_stride,
                                               int height);
| 55 | |
// Generates the public entry point vpx_masked_sad<m>x<n>_ssse3(), which
// forwards to the generic width-multiple-of-16 kernel with the block
// dimensions baked in. (Comments cannot appear inside the macro body
// because // would swallow the line-continuation backslashes.)
#define MASKSADMXN_SSSE3(m, n) \
unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
                                             int src_stride, \
                                             const uint8_t *ref, \
                                             int ref_stride, \
                                             const uint8_t *msk, \
                                             int msk_stride) { \
  return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
                          m, n); \
}

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
| 80 | |
// Generates vpx_masked_sad8x<n>_ssse3(), forwarding to the width-8 kernel
// that processes two rows per iteration.
#define MASKSAD8XN_SSSE3(n) \
unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src, \
                                         int src_stride, \
                                         const uint8_t *ref, \
                                         int ref_stride, \
                                         const uint8_t *msk, \
                                         int msk_stride) { \
  return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                             msk_stride, n); \
}

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
| 95 | |
// Generates vpx_masked_sad4x<n>_ssse3(), forwarding to the width-4 kernel
// that processes four rows per iteration.
#define MASKSAD4XN_SSSE3(n) \
unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         const uint8_t *msk, int msk_stride) { \
  return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                             msk_stride, n); \
}

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
| 106 | |
| 107 | // For width a multiple of 16 |
Geza Lore | 697bf5b | 2016-03-02 11:12:52 +0000 | [diff] [blame] | 108 | // Assumes values in m are <=64 |
Debargha Mukherjee | 1d69cee | 2016-02-29 16:08:07 -0800 | [diff] [blame] | 109 | static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, |
| 110 | const uint8_t *b_ptr, int b_stride, |
| 111 | const uint8_t *m_ptr, int m_stride, |
| 112 | int width, int height) { |
| 113 | int y, x; |
| 114 | __m128i a, b, m, temp1, temp2; |
| 115 | __m128i res = _mm_setzero_si128(); |
| 116 | __m128i one = _mm_set1_epi16(1); |
| 117 | // For each row |
| 118 | for (y = 0; y < height; y++) { |
| 119 | // Covering the full width |
| 120 | for (x = 0; x < width; x += 16) { |
| 121 | // Load a, b, m in xmm registers |
| 122 | a = _mm_loadu_si128((const __m128i*)(a_ptr + x)); |
| 123 | b = _mm_loadu_si128((const __m128i*)(b_ptr + x)); |
| 124 | m = _mm_loadu_si128((const __m128i*)(m_ptr + x)); |
| 125 | |
| 126 | // Calculate the difference between a & b |
| 127 | temp1 = _mm_subs_epu8(a, b); |
| 128 | temp2 = _mm_subs_epu8(b, a); |
| 129 | temp1 = _mm_or_si128(temp1, temp2); |
| 130 | |
| 131 | // Multiply by m and add together |
| 132 | temp2 = _mm_maddubs_epi16(temp1, m); |
| 133 | // Pad out row result to 32 bit integers & add to running total |
| 134 | res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one)); |
| 135 | } |
| 136 | // Move onto the next row |
| 137 | a_ptr += a_stride; |
| 138 | b_ptr += b_stride; |
| 139 | m_ptr += m_stride; |
| 140 | } |
| 141 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 142 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 143 | // sad = (sad + 31) >> 6; |
| 144 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 145 | } |
| 146 | |
| 147 | static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, |
| 148 | int a_stride, |
| 149 | const uint8_t *b_ptr, |
| 150 | int b_stride, |
| 151 | const uint8_t *m_ptr, |
| 152 | int m_stride, |
| 153 | int height) { |
| 154 | int y; |
| 155 | __m128i a, b, m, temp1, temp2, row_res; |
| 156 | __m128i res = _mm_setzero_si128(); |
| 157 | __m128i one = _mm_set1_epi16(1); |
| 158 | // Add the masked SAD for 2 rows at a time |
| 159 | for (y = 0; y < height; y += 2) { |
| 160 | // Load a, b, m in xmm registers |
| 161 | a = width8_load_2rows(a_ptr, a_stride); |
| 162 | b = width8_load_2rows(b_ptr, b_stride); |
| 163 | m = width8_load_2rows(m_ptr, m_stride); |
| 164 | |
| 165 | // Calculate the difference between a & b |
| 166 | temp1 = _mm_subs_epu8(a, b); |
| 167 | temp2 = _mm_subs_epu8(b, a); |
| 168 | temp1 = _mm_or_si128(temp1, temp2); |
| 169 | |
| 170 | // Multiply by m and add together |
| 171 | row_res = _mm_maddubs_epi16(temp1, m); |
| 172 | |
| 173 | // Pad out row result to 32 bit integers & add to running total |
| 174 | res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); |
| 175 | |
| 176 | // Move onto the next rows |
| 177 | a_ptr += a_stride * 2; |
| 178 | b_ptr += b_stride * 2; |
| 179 | m_ptr += m_stride * 2; |
| 180 | } |
| 181 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 182 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 183 | // sad = (sad + 31) >> 6; |
| 184 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 185 | } |
| 186 | |
| 187 | static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, |
| 188 | int a_stride, |
| 189 | const uint8_t *b_ptr, |
| 190 | int b_stride, |
| 191 | const uint8_t *m_ptr, |
| 192 | int m_stride, |
| 193 | int height) { |
| 194 | int y; |
| 195 | __m128i a, b, m, temp1, temp2, row_res; |
| 196 | __m128i res = _mm_setzero_si128(); |
| 197 | __m128i one = _mm_set1_epi16(1); |
| 198 | // Add the masked SAD for 4 rows at a time |
| 199 | for (y = 0; y < height; y += 4) { |
| 200 | // Load a, b, m in xmm registers |
| 201 | a = width4_load_4rows(a_ptr, a_stride); |
| 202 | b = width4_load_4rows(b_ptr, b_stride); |
| 203 | m = width4_load_4rows(m_ptr, m_stride); |
| 204 | |
| 205 | // Calculate the difference between a & b |
| 206 | temp1 = _mm_subs_epu8(a, b); |
| 207 | temp2 = _mm_subs_epu8(b, a); |
| 208 | temp1 = _mm_or_si128(temp1, temp2); |
| 209 | |
| 210 | // Multiply by m and add together |
| 211 | row_res = _mm_maddubs_epi16(temp1, m); |
| 212 | |
| 213 | // Pad out row result to 32 bit integers & add to running total |
| 214 | res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); |
| 215 | |
| 216 | // Move onto the next rows |
| 217 | a_ptr += a_stride * 4; |
| 218 | b_ptr += b_stride * 4; |
| 219 | m_ptr += m_stride * 4; |
| 220 | } |
| 221 | // Pad out row result to 32 bit integers & add to running total |
| 222 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 223 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 224 | // sad = (sad + 31) >> 6; |
| 225 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 226 | } |
| 227 | |
| 228 | #if CONFIG_VP9_HIGHBITDEPTH |
| 229 | static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, |
| 230 | int stride) { |
| 231 | __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr); |
| 232 | __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride)); |
| 233 | return _mm_unpacklo_epi64(temp1, temp2); |
| 234 | } |
| 235 | |
// Forward declarations of the high-bit-depth kernels. The a8/b8 pointers
// are CONVERT_TO_BYTEPTR-style aliases of uint16_t buffers; the kernels
// convert them back with CONVERT_TO_SHORTPTR. Definitions follow the
// macro expansions below.
static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
                                                   int a_stride,
                                                   const uint8_t *b8_ptr,
                                                   int b_stride,
                                                   const uint8_t *m_ptr,
                                                   int m_stride,
                                                   int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
                                                      int a_stride,
                                                      const uint8_t *b8_ptr,
                                                      int b_stride,
                                                      const uint8_t *m_ptr,
                                                      int m_stride,
                                                      int height);
| 251 | |
// Generates vpx_highbd_masked_sad<m>x<n>_ssse3(), forwarding to the
// generic high-bit-depth kernel with the block dimensions baked in.
#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
                                                    int src_stride, \
                                                    const uint8_t *ref, \
                                                    int ref_stride, \
                                                    const uint8_t *msk, \
                                                    int msk_stride) { \
  return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
                                 msk_stride, m, n); \
}

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)

// Generates vpx_highbd_masked_sad4x<n>_ssse3(), forwarding to the width-4
// high-bit-depth kernel that processes two rows per iteration.
#define HIGHBD_MASKSAD4XN_SSSE3(n) \
unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \
                                                int src_stride, \
                                                const uint8_t *ref, \
                                                int ref_stride, \
                                                const uint8_t *msk, \
                                                int msk_stride) { \
  return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
                                    msk_stride, n); \
}

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
| 293 | |
| 294 | // For width a multiple of 8 |
| 295 | // Assumes values in m are <=64 |
| 296 | static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, |
| 297 | int a_stride, |
| 298 | const uint8_t *b8_ptr, |
| 299 | int b_stride, |
| 300 | const uint8_t *m_ptr, |
| 301 | int m_stride, |
| 302 | int width, int height) { |
| 303 | int y, x; |
| 304 | __m128i a, b, m, temp1, temp2; |
| 305 | const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); |
| 306 | const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); |
| 307 | __m128i res = _mm_setzero_si128(); |
| 308 | // For each row |
| 309 | for (y = 0; y < height; y++) { |
| 310 | // Covering the full width |
| 311 | for (x = 0; x < width; x += 8) { |
| 312 | // Load a, b, m in xmm registers |
| 313 | a = _mm_loadu_si128((const __m128i*)(a_ptr + x)); |
| 314 | b = _mm_loadu_si128((const __m128i*)(b_ptr + x)); |
| 315 | m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)), |
| 316 | _mm_setzero_si128()); |
| 317 | |
| 318 | // Calculate the difference between a & b |
| 319 | temp1 = _mm_subs_epu16(a, b); |
| 320 | temp2 = _mm_subs_epu16(b, a); |
| 321 | temp1 = _mm_or_si128(temp1, temp2); |
| 322 | |
| 323 | // Add result of multiplying by m and add pairs together to running total |
| 324 | res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); |
| 325 | } |
| 326 | // Move onto the next row |
| 327 | a_ptr += a_stride; |
| 328 | b_ptr += b_stride; |
| 329 | m_ptr += m_stride; |
| 330 | } |
| 331 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 332 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 333 | // sad = (sad + 31) >> 6; |
| 334 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 335 | } |
| 336 | |
| 337 | static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr, |
| 338 | int a_stride, |
| 339 | const uint8_t *b8_ptr, |
| 340 | int b_stride, |
| 341 | const uint8_t *m_ptr, |
| 342 | int m_stride, |
| 343 | int height) { |
| 344 | int y; |
| 345 | __m128i a, b, m, temp1, temp2; |
| 346 | const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); |
| 347 | const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); |
| 348 | __m128i res = _mm_setzero_si128(); |
| 349 | // Add the masked SAD for 2 rows at a time |
| 350 | for (y = 0; y < height; y += 2) { |
| 351 | // Load a, b, m in xmm registers |
| 352 | a = highbd_width4_load_2rows(a_ptr, a_stride); |
| 353 | b = highbd_width4_load_2rows(b_ptr, b_stride); |
| 354 | temp1 = _mm_loadl_epi64((const __m128i*)m_ptr); |
| 355 | temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride)); |
| 356 | m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), |
| 357 | _mm_setzero_si128()); |
| 358 | |
| 359 | // Calculate the difference between a & b |
| 360 | temp1 = _mm_subs_epu16(a, b); |
| 361 | temp2 = _mm_subs_epu16(b, a); |
| 362 | temp1 = _mm_or_si128(temp1, temp2); |
| 363 | |
| 364 | // Multiply by m and add together |
| 365 | res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); |
| 366 | |
| 367 | // Move onto the next rows |
| 368 | a_ptr += a_stride * 2; |
| 369 | b_ptr += b_stride * 2; |
| 370 | m_ptr += m_stride * 2; |
| 371 | } |
| 372 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 373 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 374 | // sad = (sad + 31) >> 6; |
| 375 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 376 | } |
| 377 | #endif // CONFIG_VP9_HIGHBITDEPTH |