Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | /* |
Yaowu Xu | bde4ac8 | 2016-11-28 15:26:06 -0800 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | * |
Yaowu Xu | bde4ac8 | 2016-11-28 15:26:06 -0800 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
| 12 | #include <stdlib.h> |
| 13 | #include <emmintrin.h> |
| 14 | #include <tmmintrin.h> |
| 15 | |
| 16 | #include "aom_ports/mem.h" |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 17 | #include "./aom_config.h" |
| 18 | #include "aom/aom_integer.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 19 | |
| 20 | static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { |
| 21 | __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); |
| 22 | __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); |
| 23 | return _mm_unpacklo_epi64(temp1, temp2); |
| 24 | } |
| 25 | |
| 26 | static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { |
| 27 | __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); |
| 28 | __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); |
| 29 | __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); |
| 30 | temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); |
| 31 | temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); |
| 32 | temp1 = _mm_unpacklo_epi32(temp1, temp2); |
| 33 | return _mm_unpacklo_epi64(temp3, temp1); |
| 34 | } |
| 35 | |
// Forward declarations of the kernels defined below. Each computes a
// mask-weighted SAD between two pixel blocks:
//   sum over all pixels of m * |a - b|, then rounded and scaled by 1/64
// (mask values are assumed to be <= 64).

// Generic kernel for widths that are a multiple of 16.
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

// Specialization for 8-pixel-wide blocks (2 rows per iteration).
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

// Specialization for 4-pixel-wide blocks (4 rows per iteration).
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);
| 48 | |
// Emits the public entry point aom_masked_sad<m>x<n>_ssse3() for a block of
// width m (a multiple of 16) and height n, forwarding to the generic kernel.
#define MASKSADMXN_SSSE3(m, n)                                                 \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
      const uint8_t *msk, int msk_stride) {                                    \
    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
                            m, n);                                             \
  }

// Instantiate the exported functions for every supported block size with
// width >= 16.
#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
| 70 | |
// Emits the public entry point aom_masked_sad8x<n>_ssse3() for an
// 8-pixel-wide block of height n, forwarding to the width-8 kernel.
#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
| 82 | |
// Emits the public entry point aom_masked_sad4x<n>_ssse3() for a
// 4-pixel-wide block of height n, forwarding to the width-4 kernel.
#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
| 93 | |
| 94 | // For width a multiple of 16 |
| 95 | // Assumes values in m are <=64 |
| 96 | static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, |
| 97 | const uint8_t *b_ptr, int b_stride, |
| 98 | const uint8_t *m_ptr, int m_stride, |
| 99 | int width, int height) { |
| 100 | int y, x; |
| 101 | __m128i a, b, m, temp1, temp2; |
| 102 | __m128i res = _mm_setzero_si128(); |
| 103 | __m128i one = _mm_set1_epi16(1); |
| 104 | // For each row |
| 105 | for (y = 0; y < height; y++) { |
| 106 | // Covering the full width |
| 107 | for (x = 0; x < width; x += 16) { |
| 108 | // Load a, b, m in xmm registers |
| 109 | a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); |
| 110 | b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); |
| 111 | m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); |
| 112 | |
| 113 | // Calculate the difference between a & b |
| 114 | temp1 = _mm_subs_epu8(a, b); |
| 115 | temp2 = _mm_subs_epu8(b, a); |
| 116 | temp1 = _mm_or_si128(temp1, temp2); |
| 117 | |
| 118 | // Multiply by m and add together |
| 119 | temp2 = _mm_maddubs_epi16(temp1, m); |
| 120 | // Pad out row result to 32 bit integers & add to running total |
| 121 | res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one)); |
| 122 | } |
| 123 | // Move onto the next row |
| 124 | a_ptr += a_stride; |
| 125 | b_ptr += b_stride; |
| 126 | m_ptr += m_stride; |
| 127 | } |
| 128 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 129 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 130 | // sad = (sad + 31) >> 6; |
| 131 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 132 | } |
| 133 | |
| 134 | static INLINE unsigned int masked_sad8xh_ssse3( |
| 135 | const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, |
| 136 | const uint8_t *m_ptr, int m_stride, int height) { |
| 137 | int y; |
| 138 | __m128i a, b, m, temp1, temp2, row_res; |
| 139 | __m128i res = _mm_setzero_si128(); |
| 140 | __m128i one = _mm_set1_epi16(1); |
| 141 | // Add the masked SAD for 2 rows at a time |
| 142 | for (y = 0; y < height; y += 2) { |
| 143 | // Load a, b, m in xmm registers |
| 144 | a = width8_load_2rows(a_ptr, a_stride); |
| 145 | b = width8_load_2rows(b_ptr, b_stride); |
| 146 | m = width8_load_2rows(m_ptr, m_stride); |
| 147 | |
| 148 | // Calculate the difference between a & b |
| 149 | temp1 = _mm_subs_epu8(a, b); |
| 150 | temp2 = _mm_subs_epu8(b, a); |
| 151 | temp1 = _mm_or_si128(temp1, temp2); |
| 152 | |
| 153 | // Multiply by m and add together |
| 154 | row_res = _mm_maddubs_epi16(temp1, m); |
| 155 | |
| 156 | // Pad out row result to 32 bit integers & add to running total |
| 157 | res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); |
| 158 | |
| 159 | // Move onto the next rows |
| 160 | a_ptr += a_stride * 2; |
| 161 | b_ptr += b_stride * 2; |
| 162 | m_ptr += m_stride * 2; |
| 163 | } |
| 164 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 165 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 166 | // sad = (sad + 31) >> 6; |
| 167 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 168 | } |
| 169 | |
| 170 | static INLINE unsigned int masked_sad4xh_ssse3( |
| 171 | const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, |
| 172 | const uint8_t *m_ptr, int m_stride, int height) { |
| 173 | int y; |
| 174 | __m128i a, b, m, temp1, temp2, row_res; |
| 175 | __m128i res = _mm_setzero_si128(); |
| 176 | __m128i one = _mm_set1_epi16(1); |
| 177 | // Add the masked SAD for 4 rows at a time |
| 178 | for (y = 0; y < height; y += 4) { |
| 179 | // Load a, b, m in xmm registers |
| 180 | a = width4_load_4rows(a_ptr, a_stride); |
| 181 | b = width4_load_4rows(b_ptr, b_stride); |
| 182 | m = width4_load_4rows(m_ptr, m_stride); |
| 183 | |
| 184 | // Calculate the difference between a & b |
| 185 | temp1 = _mm_subs_epu8(a, b); |
| 186 | temp2 = _mm_subs_epu8(b, a); |
| 187 | temp1 = _mm_or_si128(temp1, temp2); |
| 188 | |
| 189 | // Multiply by m and add together |
| 190 | row_res = _mm_maddubs_epi16(temp1, m); |
| 191 | |
| 192 | // Pad out row result to 32 bit integers & add to running total |
| 193 | res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); |
| 194 | |
| 195 | // Move onto the next rows |
| 196 | a_ptr += a_stride * 4; |
| 197 | b_ptr += b_stride * 4; |
| 198 | m_ptr += m_stride * 4; |
| 199 | } |
| 200 | // Pad out row result to 32 bit integers & add to running total |
| 201 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 202 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 203 | // sad = (sad + 31) >> 6; |
| 204 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 205 | } |
| 206 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 207 | #if CONFIG_AOM_HIGHBITDEPTH |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 208 | static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, |
| 209 | int stride) { |
| 210 | __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); |
| 211 | __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); |
| 212 | return _mm_unpacklo_epi64(temp1, temp2); |
| 213 | } |
| 214 | |
// Forward declarations of the high-bitdepth kernels defined below.
// Pixel pointers are passed as uint8_t* and converted to uint16_t* with
// CONVERT_TO_SHORTPTR inside the kernels; the mask remains 8-bit.

// Generic high-bitdepth kernel for widths that are a multiple of 8.
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int width, int height);

// Specialization for 4-pixel-wide blocks (2 rows per iteration).
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);
| 222 | |
// Emits the public entry point aom_highbd_masked_sad<m>x<n>_ssse3() for a
// block of width m (a multiple of 8) and height n, forwarding to the
// generic high-bitdepth kernel.
#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
                                   msk_stride, m, n);                         \
  }

// Instantiate the exported functions for every supported block size with
// width >= 8.
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
| 247 | |
// Emits the public entry point aom_highbd_masked_sad4x<n>_ssse3() for a
// 4-pixel-wide block of height n, forwarding to the width-4 kernel.
#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
                                      msk_stride, n);                         \
  }

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
| 258 | |
| 259 | // For width a multiple of 8 |
| 260 | // Assumes values in m are <=64 |
| 261 | static INLINE unsigned int highbd_masked_sad_ssse3( |
| 262 | const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, |
| 263 | const uint8_t *m_ptr, int m_stride, int width, int height) { |
| 264 | int y, x; |
| 265 | __m128i a, b, m, temp1, temp2; |
| 266 | const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); |
| 267 | const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); |
| 268 | __m128i res = _mm_setzero_si128(); |
| 269 | // For each row |
| 270 | for (y = 0; y < height; y++) { |
| 271 | // Covering the full width |
| 272 | for (x = 0; x < width; x += 8) { |
| 273 | // Load a, b, m in xmm registers |
| 274 | a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); |
| 275 | b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); |
| 276 | m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), |
| 277 | _mm_setzero_si128()); |
| 278 | |
| 279 | // Calculate the difference between a & b |
| 280 | temp1 = _mm_subs_epu16(a, b); |
| 281 | temp2 = _mm_subs_epu16(b, a); |
| 282 | temp1 = _mm_or_si128(temp1, temp2); |
| 283 | |
| 284 | // Add result of multiplying by m and add pairs together to running total |
| 285 | res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); |
| 286 | } |
| 287 | // Move onto the next row |
| 288 | a_ptr += a_stride; |
| 289 | b_ptr += b_stride; |
| 290 | m_ptr += m_stride; |
| 291 | } |
| 292 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 293 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 294 | // sad = (sad + 31) >> 6; |
| 295 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 296 | } |
| 297 | |
| 298 | static INLINE unsigned int highbd_masked_sad4xh_ssse3( |
| 299 | const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, |
| 300 | const uint8_t *m_ptr, int m_stride, int height) { |
| 301 | int y; |
| 302 | __m128i a, b, m, temp1, temp2; |
| 303 | const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); |
| 304 | const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); |
| 305 | __m128i res = _mm_setzero_si128(); |
| 306 | // Add the masked SAD for 2 rows at a time |
| 307 | for (y = 0; y < height; y += 2) { |
| 308 | // Load a, b, m in xmm registers |
| 309 | a = highbd_width4_load_2rows(a_ptr, a_stride); |
| 310 | b = highbd_width4_load_2rows(b_ptr, b_stride); |
| 311 | temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); |
| 312 | temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); |
| 313 | m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), |
| 314 | _mm_setzero_si128()); |
| 315 | |
| 316 | // Calculate the difference between a & b |
| 317 | temp1 = _mm_subs_epu16(a, b); |
| 318 | temp2 = _mm_subs_epu16(b, a); |
| 319 | temp1 = _mm_or_si128(temp1, temp2); |
| 320 | |
| 321 | // Multiply by m and add together |
| 322 | res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); |
| 323 | |
| 324 | // Move onto the next rows |
| 325 | a_ptr += a_stride * 2; |
| 326 | b_ptr += b_stride * 2; |
| 327 | m_ptr += m_stride * 2; |
| 328 | } |
| 329 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 330 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 331 | // sad = (sad + 31) >> 6; |
| 332 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 333 | } |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 334 | #endif // CONFIG_AOM_HIGHBITDEPTH |