Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | ; |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 2 | ; Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | ; |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 4 | ; This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | ; was not distributed with this source code in the LICENSE file, you can |
| 7 | ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | ; Media Patent License 1.0 was not distributed with this source code in the |
| 9 | ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | ; |
| 11 | |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 12 | ; |
| 13 | |
| 14 | %include "aom_ports/x86_abi_support.asm" |
| 15 | |
;------------------------------------------------------------------------------
; HIGH_GET_PARAM_4
;   Loads the arguments and constants used by the 4-pixel-wide filters.
;   On exit:
;     rsi  = src_ptr          rdi  = output_ptr
;     rax  = pixels_per_line  rdx  = out_pitch      rcx = output_height
;     xmm4 = (k3,k4) tap pair repeated in every dword
;     xmm3 = rounding constant 0x40 in every dword
;     xmm5 = (1 << bps) - 1 in every word (upper clamp)
;     xmm2 = zero (lower clamp)
;   xmm1 is used as scratch.
;------------------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    ; Taps: words 3 and 4 of the 8-word kernel, interleaved as k3,k4 pairs.
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm3, [rdx]                 ;aligned load of the whole kernel
    pshuflw     xmm4, xmm3, 0xff            ;k3 in the four low words
    psrldq      xmm3, 8                     ;move words 4..7 to the low half
    pshuflw     xmm3, xmm3, 0               ;k4 in the four low words
    punpcklwd   xmm4, xmm3                  ;k3k4

    ; Rounding term added before the arithmetic shift by 7.
    mov         rcx, 0x00000040
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0               ;rounding in every dword

    ; Clamp limits: start from 1 in every word, shift left by bps, subtract 1.
    mov         rdx, 0x00010001
    movq        xmm5, rdx
    pshufd      xmm5, xmm5, 0               ;1 in every word
    movdqa      xmm1, xmm5                  ;keep a copy of the ones
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm2, rcx                   ;shift count
    psllw       xmm5, xmm2                  ;1 << bps
    psubw       xmm5, xmm1                  ;max value (for clamping)
    pxor        xmm2, xmm2                  ;min value (for clamping)

    ; Pointers, strides and row count consumed by the row loop.
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
| 45 | |
;------------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
;   Filters one row of four 16-bit pixels and advances to the next row.
;   In:  xmm0 = first-tap pixels, xmm1 = second-tap pixels (low four words),
;        xmm4 = tap pairs, xmm3 = rounding, xmm5/xmm2 = max/min clamp,
;        rsi/rdi = source/destination, rax/rdx = strides in 16-bit units,
;        rcx = rows remaining.
;   %1 != 0: the result is averaged (pavgw) with what is already at [rdi].
;   Out: 8 bytes stored at [rdi]; rsi and rdi stepped one row; rcx decremented.
;        Nothing after the final `dec` touches the flags, so the caller can
;        branch on ZF directly.
;------------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd   xmm0, xmm1                  ;interleave the two tap inputs
    pmaddwd     xmm0, xmm4                  ;a*k3 + b*k4 per dword
    paddd       xmm0, xmm3                  ;add rounding
    psrad       xmm0, 7                     ;scale back down
    packssdw    xmm0, xmm0                  ;dwords -> words (signed saturate)

    ;clamp to [min, max]
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    movq        xmm1, [rdi]                 ;current destination pixels
    pavgw       xmm0, xmm1                  ;rounded average with them
%endif

    movq        [rdi], xmm0                 ;write four pixels
    lea         rdi, [rdi + 2*rdx]          ;next destination row
    lea         rsi, [rsi + 2*rax]          ;next source row
    dec         rcx                         ;one row done; sets ZF for caller
%endm
| 69 | |
| 70 | %if ARCH_X86_64 |
;------------------------------------------------------------------------------
; HIGH_GET_PARAM
;   Loads the arguments and constants used by the 8- and 16-pixel-wide
;   filters.
;   On exit:
;     rsi  = src_ptr          rdi  = output_ptr
;     rax  = pixels_per_line  rdx  = out_pitch      rcx = output_height
;     xmm7 = (k3,k4) tap pair repeated in every dword
;     xmm4 = rounding constant 0x40 in every dword
;     xmm8 = (1 << bps) - 1 in every word (upper clamp)
;     xmm5 = zero (lower clamp)
;   xmm1 and xmm6 are used as scratch.
;------------------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    ; Taps: words 3 and 4 of the 8-word kernel, interleaved as k3,k4 pairs.
    mov         rdx, arg(5)                 ;filter ptr
    movdqa      xmm6, [rdx]                 ;aligned load of the whole kernel
    pshuflw     xmm7, xmm6, 0xff            ;k3 in the four low words
    pshufhw     xmm6, xmm6, 0               ;k4 in the four high words
    psrldq      xmm6, 8                     ;...moved down to the low words
    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4

    ; Rounding term added before the arithmetic shift by 7.
    mov         rcx, 0x00000040
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0               ;rounding in every dword

    ; Clamp limits: start from 1 in every word, shift left by bps, subtract 1.
    mov         rdx, 0x00010001
    movq        xmm8, rdx
    pshufd      xmm8, xmm8, 0               ;1 in every word
    movdqa      xmm1, xmm8                  ;keep a copy of the ones
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm5, rcx                   ;shift count
    psllw       xmm8, xmm5                  ;1 << bps
    psubw       xmm8, xmm1                  ;max value (for clamping)
    pxor        xmm5, xmm5                  ;min value (for clamping)

    ; Pointers, strides and row count consumed by the row loop.
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
| 101 | |
;------------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
;   Filters one row of eight 16-bit pixels and advances to the next row.
;   In:  xmm0 = first-tap pixels, xmm1 = second-tap pixels,
;        xmm7 = tap pairs, xmm4 = rounding, xmm8/xmm5 = max/min clamp,
;        rsi/rdi = source/destination, rax/rdx = strides in 16-bit units,
;        rcx = rows remaining.
;   %1 != 0: the result is averaged (pavgw) with what is already at [rdi].
;   Out: 16 bytes stored at [rdi]; rsi and rdi stepped one row; rcx
;        decremented, with ZF left for the caller's branch. xmm6 is scratch.
;------------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa      xmm6, xmm0                  ;copy before xmm0 is overwritten
    punpcklwd   xmm0, xmm1                  ;pixels 0..3 paired with 2nd tap
    punpckhwd   xmm6, xmm1                  ;pixels 4..7 paired with 2nd tap
    pmaddwd     xmm0, xmm7                  ;a*k3 + b*k4 per dword
    pmaddwd     xmm6, xmm7

    paddd       xmm0, xmm4                  ;add rounding
    paddd       xmm6, xmm4
    psrad       xmm0, 7                     ;scale back down
    psrad       xmm6, 7
    packssdw    xmm0, xmm6                  ;dwords -> eight words

    ;clamp to [min, max]
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5

%if %1
    movdqu      xmm1, [rdi]                 ;current destination pixels
    pavgw       xmm0, xmm1                  ;rounded average with them
%endif
    movdqu      [rdi], xmm0                 ;write eight pixels

    lea         rdi, [rdi + 2*rdx]          ;next destination row
    lea         rsi, [rsi + 2*rax]          ;next source row
    dec         rcx                         ;one row done; sets ZF for caller
%endm
| 129 | |
;------------------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
;   Filters one row of sixteen 16-bit pixels and advances to the next row.
;   In:  xmm0/xmm2 = first-tap pixels 0..7 / 8..15,
;        xmm1/xmm3 = second-tap pixels 0..7 / 8..15,
;        xmm7 = tap pairs, xmm4 = rounding, xmm8/xmm5 = max/min clamp,
;        rsi/rdi = source/destination, rax/rdx = strides in 16-bit units,
;        rcx = rows remaining.
;   %1 != 0: the result is averaged (pavgw) with what is already at [rdi].
;   Out: 32 bytes stored at [rdi]; rsi and rdi stepped one row; rcx
;        decremented, with ZF left for the caller's branch.
;        xmm6 and xmm9 are scratch.
;------------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    ; ---- pixels 0..7 ----
    movdqa      xmm9, xmm0                  ;copy before xmm0 is overwritten
    punpcklwd   xmm0, xmm1                  ;pixels 0..3 paired with 2nd tap
    punpckhwd   xmm9, xmm1                  ;pixels 4..7 paired with 2nd tap
    pmaddwd     xmm0, xmm7                  ;a*k3 + b*k4 per dword
    pmaddwd     xmm9, xmm7
    paddd       xmm0, xmm4                  ;add rounding
    paddd       xmm9, xmm4
    psrad       xmm0, 7                     ;scale back down
    psrad       xmm9, 7
    packssdw    xmm0, xmm9                  ;dwords -> eight words
    pminsw      xmm0, xmm8                  ;clamp to max
    pmaxsw      xmm0, xmm5                  ;clamp to min

    ; ---- pixels 8..15 ----
    movdqa      xmm6, xmm2                  ;copy before xmm2 is overwritten
    punpcklwd   xmm2, xmm3                  ;pixels 8..11 paired with 2nd tap
    punpckhwd   xmm6, xmm3                  ;pixels 12..15 paired with 2nd tap
    pmaddwd     xmm2, xmm7
    pmaddwd     xmm6, xmm7
    paddd       xmm2, xmm4
    paddd       xmm6, xmm4
    psrad       xmm2, 7
    psrad       xmm6, 7
    packssdw    xmm2, xmm6                  ;dwords -> eight words
    pminsw      xmm2, xmm8                  ;clamp to max
    pmaxsw      xmm2, xmm5                  ;clamp to min

%if %1
    movdqu      xmm1, [rdi]                 ;current destination pixels 0..7
    movdqu      xmm3, [rdi + 16]            ;current destination pixels 8..15
    pavgw       xmm0, xmm1                  ;rounded average with them
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ;write pixels 0..7
    movdqu      [rdi + 16], xmm2            ;write pixels 8..15

    lea         rdi, [rdi + 2*rdx]          ;next destination row
    lea         rsi, [rsi + 2*rax]          ;next source row
    dec         rcx                         ;one row done; sets ZF for caller
%endm
| 175 | %endif |
| 176 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d4_v2_sse2
;   Two-tap filter over rows of four 16-bit pixels. The taps are applied to
;   the pixels at rsi and the pixels one source row (pixels_per_line
;   elements) further on.
;   Stack arguments, as named by HIGH_GET_PARAM_4:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   The row loop tests its counter after the body, so the body always runs
;   at least once. NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ;four pixels from the row below
    movq        xmm0, [rsi]                 ;four pixels from the current row

    HIGH_APPLY_FILTER_4 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 200 | |
| 201 | %if ARCH_X86_64 |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d8_v2_sse2
;   Two-tap filter over rows of eight 16-bit pixels. The taps are applied to
;   the pixels at rsi and the pixels one source row (pixels_per_line
;   elements) further on.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm8; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ;eight pixels from the row below
    movdqu      xmm0, [rsi]                 ;eight pixels from the current row

    HIGH_APPLY_FILTER_8 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 227 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d16_v2_sse2
;   Two-tap filter over rows of sixteen 16-bit pixels. The taps are applied
;   to the pixels at rsi and the pixels one source row (pixels_per_line
;   elements) further on.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm9; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; current row
    movdqu      xmm0, [rsi]                 ;pixels 0..7
    movdqu      xmm2, [rsi + 16]            ;pixels 8..15
    ; row below
    movdqu      xmm3, [rsi + 2*rax + 16]    ;pixels 8..15
    movdqu      xmm1, [rsi + 2*rax]         ;pixels 0..7

    HIGH_APPLY_FILTER_16 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 255 | %endif |
| 256 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d4_v2_avg_sse2
;   Same filtering as the non-avg 4-wide variant (pixels at rsi combined
;   with the pixels one source row further on), but each filtered row is
;   averaged with the pixels already present in the output before storing.
;   Stack arguments, as named by HIGH_GET_PARAM_4:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ;four pixels from the row below
    movq        xmm0, [rsi]                 ;four pixels from the current row

    HIGH_APPLY_FILTER_4 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 280 | |
| 281 | %if ARCH_X86_64 |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d8_v2_avg_sse2
;   Same filtering as the non-avg 8-wide variant (pixels at rsi combined
;   with the pixels one source row further on), but each filtered row is
;   averaged with the pixels already present in the output before storing.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm8; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ;eight pixels from the row below
    movdqu      xmm0, [rsi]                 ;eight pixels from the current row

    HIGH_APPLY_FILTER_8 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 307 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d16_v2_avg_sse2
;   Same filtering as the non-avg 16-wide variant (pixels at rsi combined
;   with the pixels one source row further on), but each filtered row is
;   averaged with the pixels already present in the output before storing.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm9; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; current row
    movdqu      xmm0, [rsi]                 ;pixels 0..7
    movdqu      xmm2, [rsi + 16]            ;pixels 8..15
    ; row below
    movdqu      xmm1, [rsi + 2*rax]         ;pixels 0..7
    movdqu      xmm3, [rsi + 2*rax + 16]    ;pixels 8..15

    HIGH_APPLY_FILTER_16 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 335 | %endif |
| 336 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d4_h2_sse2
;   Two-tap filter over rows of four 16-bit pixels. The taps are applied to
;   each pixel and the pixel one element (2 bytes) after it in the same row.
;   Stack arguments, as named by HIGH_GET_PARAM_4:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Each iteration reads 16 bytes from the source row; the filter macro
;   consumes only the low four words of each input register.
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movdqu      xmm0, [rsi]                 ;pixels x .. x+7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 2                     ;same pixels shifted down one word

    HIGH_APPLY_FILTER_4 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 361 | |
| 362 | %if ARCH_X86_64 |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d8_h2_sse2
;   Two-tap filter over rows of eight 16-bit pixels. The taps are applied to
;   each pixel and the pixel one element (2 bytes) after it in the same row.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm8; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ;pixels x+1 .. x+8
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+7

    HIGH_APPLY_FILTER_8 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 388 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d16_h2_sse2
;   Two-tap filter over rows of sixteen 16-bit pixels. The taps are applied
;   to each pixel and the pixel one element (2 bytes) after it in the same
;   row.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm9; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; first-tap inputs
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+7
    movdqu      xmm2, [rsi + 16]            ;pixels x+8 .. x+15
    ; second-tap inputs, one pixel further along
    movdqu      xmm1, [rsi + 2]             ;pixels x+1 .. x+8
    movdqu      xmm3, [rsi + 18]            ;pixels x+9 .. x+16

    HIGH_APPLY_FILTER_16 0
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 416 | %endif |
| 417 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d4_h2_avg_sse2
;   Same filtering as the non-avg 4-wide variant (each pixel combined with
;   the pixel one element after it in the same row), but each filtered row
;   is averaged with the pixels already present in the output before
;   storing.
;   Stack arguments, as named by HIGH_GET_PARAM_4:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Each iteration reads 16 bytes from the source row; the filter macro
;   consumes only the low four words of each input register.
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.next_row:
    movdqu      xmm0, [rsi]                 ;pixels x .. x+7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 2                     ;same pixels shifted down one word

    HIGH_APPLY_FILTER_4 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 442 | |
| 443 | %if ARCH_X86_64 |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d8_h2_avg_sse2
;   Same filtering as the non-avg 8-wide variant (each pixel combined with
;   the pixel one element after it in the same row), but each filtered row
;   is averaged with the pixels already present in the output before
;   storing.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm8; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ;pixels x+1 .. x+8
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+7

    HIGH_APPLY_FILTER_8 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 469 | |
;------------------------------------------------------------------------------
; aom_highbd_filter_block1d16_h2_avg_sse2
;   Same filtering as the non-avg 16-wide variant (each pixel combined with
;   the pixel one element after it in the same row), but each filtered row
;   is averaged with the pixels already present in the output before
;   storing.
;   Stack arguments, as named by HIGH_GET_PARAM:
;     arg(0) src_ptr      arg(1) pixels_per_line  arg(2) output_ptr
;     arg(3) out_pitch    arg(4) output_height    arg(5) filter ptr
;     arg(6) bps
;   Uses xmm0..xmm9; SAVE_XMM/RESTORE_XMM come from the included ABI helper
;   file (presumably preserving callee-saved xmm registers where required).
;   NOTE(review): assumes output_height > 0 - confirm callers.
;------------------------------------------------------------------------------
global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
sym(aom_highbd_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.next_row:
    ; first-tap inputs
    movdqu      xmm0, [rsi]                 ;pixels x   .. x+7
    movdqu      xmm2, [rsi + 16]            ;pixels x+8 .. x+15
    ; second-tap inputs, one pixel further along
    movdqu      xmm1, [rsi + 2]             ;pixels x+1 .. x+8
    movdqu      xmm3, [rsi + 18]            ;pixels x+9 .. x+16

    HIGH_APPLY_FILTER_16 1
    jnz         .next_row                   ;ZF comes from the macro's dec rcx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 497 | %endif |