/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/macros_msa.h"

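/* Builds a 16-byte vector from four 32-bit words by inserting each word into
 * consecutive word lanes of the destination register. */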
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

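/* SAD of a 4-pixel-wide block: four rows of source and reference are gathered
 * into one vector each per iteration before taking absolute differences. */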
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

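/* SAD of an 8-pixel-wide block: pairs of rows are packed into full 16-byte
 * vectors so two rows are compared per SAD_UB2_UH call. */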
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

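/* SAD of a 16-pixel-wide block, processing four rows per loop iteration. */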
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

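/* SAD of a 32-pixel-wide block: each row takes two 16-byte loads and four
 * rows are handled per loop iteration. */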
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

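/* SAD of a 64-pixel-wide block: each row takes four 16-byte loads and the two
 * row halves go into separate 16-bit accumulators that are combined at the
 * end. */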
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

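/* x4d variant: SADs of one 4-wide source block against four reference blocks
 * (aref_ptr[0..3]) are computed in a single pass over the source rows. */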
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

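/* x4d variant for 8-wide blocks: the four loaded rows of each reference are
 * packed into two 16-byte vectors before the SAD accumulation. */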
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

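/* x4d variant for 16-wide blocks, two rows per loop iteration. */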
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

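/* x4d variant for 32-wide blocks, one row per loop iteration. */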
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

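/* x4d variant for 64-wide blocks; each reference keeps two 16-bit
 * accumulators that are combined into the final 32-bit result. */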
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad_array[0] = HADD_UH_U32(sad0_0);
  sad_array[0] += HADD_UH_U32(sad0_1);
  sad_array[1] = HADD_UH_U32(sad1_0);
  sad_array[1] += HADD_UH_U32(sad1_1);
  sad_array[2] = HADD_UH_U32(sad2_0);
  sad_array[2] += HADD_UH_U32(sad2_1);
  sad_array[3] = HADD_UH_U32(sad3_0);
  sad_array[3] += HADD_UH_U32(sad3_1);
}

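/* Averaged-prediction SAD: the reference is first averaged with the second
 * predictor (sec_pred) and the SAD is taken against that result. */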
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

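/* Averaged-prediction SAD for 8-wide blocks. */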
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

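/* Averaged-prediction SAD for 16-wide blocks, eight rows per iteration. */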
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

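/* Averaged-prediction SAD for 32-wide blocks, four rows per iteration. */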
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

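/* Averaged-prediction SAD for 64-wide blocks; the 16-bit partial sums are
 * widened to 32 bits before the final reduction. */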
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

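/* The macros below stamp out the public aom_sad<W>x<H>_msa,
 * aom_sad<W>x<H>x4d_msa and aom_sad<W>x<H>_avg_msa entry points that wrap the
 * width-specific helpers above. */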
#define AOM_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define AOM_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define AOM_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define AOM_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define AOM_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t aom_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define AOM_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t aom_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define AOM_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t aom_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

/* clang-format off */
// 64x64
AOM_SAD_64xHEIGHT_MSA(64)
AOM_SAD_64xHEIGHTx4D_MSA(64)
AOM_AVGSAD_64xHEIGHT_MSA(64)

// 64x32
AOM_SAD_64xHEIGHT_MSA(32)
AOM_SAD_64xHEIGHTx4D_MSA(32)
AOM_AVGSAD_64xHEIGHT_MSA(32)

// 32x64
AOM_SAD_32xHEIGHT_MSA(64)
AOM_SAD_32xHEIGHTx4D_MSA(64)
AOM_AVGSAD_32xHEIGHT_MSA(64)

// 32x32
AOM_SAD_32xHEIGHT_MSA(32)
AOM_SAD_32xHEIGHTx4D_MSA(32)
AOM_AVGSAD_32xHEIGHT_MSA(32)

// 32x16
AOM_SAD_32xHEIGHT_MSA(16)
AOM_SAD_32xHEIGHTx4D_MSA(16)
AOM_AVGSAD_32xHEIGHT_MSA(16)

// 16x32
AOM_SAD_16xHEIGHT_MSA(32)
AOM_SAD_16xHEIGHTx4D_MSA(32)
AOM_AVGSAD_16xHEIGHT_MSA(32)

// 16x16
AOM_SAD_16xHEIGHT_MSA(16)
AOM_SAD_16xHEIGHTx4D_MSA(16)
AOM_AVGSAD_16xHEIGHT_MSA(16)

// 16x8
AOM_SAD_16xHEIGHT_MSA(8)
AOM_SAD_16xHEIGHTx4D_MSA(8)
AOM_AVGSAD_16xHEIGHT_MSA(8)

// 8x16
AOM_SAD_8xHEIGHT_MSA(16)
AOM_SAD_8xHEIGHTx4D_MSA(16)
AOM_AVGSAD_8xHEIGHT_MSA(16)

// 8x8
AOM_SAD_8xHEIGHT_MSA(8)
AOM_SAD_8xHEIGHTx4D_MSA(8)
AOM_AVGSAD_8xHEIGHT_MSA(8)

// 8x4
AOM_SAD_8xHEIGHT_MSA(4)
AOM_SAD_8xHEIGHTx4D_MSA(4)
AOM_AVGSAD_8xHEIGHT_MSA(4)

// 4x8
AOM_SAD_4xHEIGHT_MSA(8)
AOM_SAD_4xHEIGHTx4D_MSA(8)
AOM_AVGSAD_4xHEIGHT_MSA(8)

// 4x4
AOM_SAD_4xHEIGHT_MSA(4)
AOM_SAD_4xHEIGHTx4D_MSA(4)
AOM_AVGSAD_4xHEIGHT_MSA(4)
/* clang-format on */