;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp9_asm_enc_offsets.asm"


; void vp9_regular_quantize_b_sse2 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1

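; Rough scalar equivalent of the SIMD code below (a sketch inferred from
; the instructions, not the project's C reference; the member names are
; illustrative, following the vp9_block_* / vp9_blockd_* offsets used):
;
;   memset(d->qcoeff, 0, 32);
;   boost_ptr = b->zrun_zbin_boost;
;   eob = 0;
;   for (i = 0; i < 16; i++) {
;     rc = zig_zag[i];                       // scan order, see ZIGZAG_LOOP
;     z  = b->coeff[rc];
;     sz = z >> 15;                          // sign mask of z
;     x  = (z ^ sz) - sz;                    // x = abs(z)
;     if (x - (b->zbin[rc] + b->zbin_extra) >= *boost_ptr++) {
;       x += b->round[rc];
;       y  = (x + ((x * b->quant[rc]) >> 16)) >> b->quant_shift[rc];
;       if (y) {
;         d->qcoeff[rc] = (y ^ sz) - sz;     // restore the sign
;         boost_ptr = b->zrun_zbin_boost;    // reset the zero-run boost
;         eob = i + 1;
;       }
;     }
;   }
;   for (rc = 0; rc < 16; rc++)
;     d->dqcoeff[rc] = d->qcoeff[rc] * d->dequant[rc];
;   d->eob = eob;
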
global sym(vp9_regular_quantize_b_sse2) PRIVATE
sym(vp9_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM 7
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif

    ALIGN_STACK 16, rax
    %define zrun_zbin_boost   0  ;  8
    %define abs_minus_zbin    8  ; 32
    %define temp_qcoeff       40 ; 32
    %define qcoeff            72 ; 32
    %define stack_size        104
    sub         rsp, stack_size
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value

    ; z
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp9_block_round] ; round_ptr

    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
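    ; (pshuflw replicates the low word across the low quadword, then
    ; punpcklwd doubles each of those words, broadcasting zbin_oq_value
    ; into all eight 16-bit lanes)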

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]
    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add (*zbin_ptr + zbin_oq_value) back
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]
    movdqa      xmm6, [rdx + 16]

    movdqa      xmm3, [rcx]
    movdqa      xmm7, [rcx + 16]

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5

    pxor        xmm6, xmm6
    ; zero qcoeff
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rdx

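; The per-coefficient work that resists vectorization is done in scan order
; by ZIGZAG_LOOP: apply the zero-run zbin boost, downshift by
; quant_shift[rc], and store any nonzero result. The boost pointer advances
; on every coefficient and is reset to the start of b->zrun_zbin_boost each
; time a nonzero value is emitted.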
%macro ZIGZAG_LOOP 1
    ; x
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin

    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]

    ; downshift by quant_shift[rc]
    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y
    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ; qcoeff_ptr[rc] = temp_qcoeff[rc]
    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
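; (0-based scan position of each coefficient in the 4x4 raster:
;    0  1  5  6
;    2  4  7 12
;    3  8 11 13
;    9 10 14 15
;  the inv_zig_zag table at the bottom of this file stores these
;  positions plus one)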
ZIGZAG_LOOP  0
ZIGZAG_LOOP  1
ZIGZAG_LOOP  4
ZIGZAG_LOOP  8
ZIGZAG_LOOP  5
ZIGZAG_LOOP  2
ZIGZAG_LOOP  3
ZIGZAG_LOOP  6
ZIGZAG_LOOP  9
ZIGZAG_LOOP 12
ZIGZAG_LOOP 13
ZIGZAG_LOOP 10
ZIGZAG_LOOP  7
ZIGZAG_LOOP 11
ZIGZAG_LOOP 14
ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr

    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2        ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rdi], xmm0        ; store dqcoeff
    movdqa      [rdi + 16], xmm1

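    ; eob: make an all-ones mask for each nonzero qcoeff lane, AND it with
    ; the 1-based scan positions in inv_zig_zag, and take the horizontal
    ; max; the result is the number of coefficients up to and including
    ; the last nonzero one in scan order.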
    ; select the last value (in zig_zag order) for EOB
    pcmpeqw     xmm2, xmm6
    pcmpeqw     xmm3, xmm6
    ; ! (invert the zero mask: 0xffff in each nonzero lane)
    pcmpeqw     xmm6, xmm6
    pxor        xmm2, xmm6
    pxor        xmm3, xmm6
    ; mask inv_zig_zag
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; select the max value
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00000001b
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff
    mov         [rsi + vp9_blockd_eob], eax

    ; begin epilog
    add         rsp, stack_size
    pop         rsp
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret

; void vp9_fast_quantize_b_sse2 | arg
;  (BLOCK  *b,                  |  0
;   BLOCKD *d)                  |  1

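; Rough scalar equivalent (again a sketch inferred from the instructions,
; with illustrative member names): no zbin dead zone and no per-coefficient
; downshift here, just round, multiply, and restore the sign:
;
;   for (rc = 0; rc < 16; rc++) {
;     z  = b->coeff[rc];
;     sz = z >> 15;                              // sign mask
;     x  = (z ^ sz) - sz;                        // abs(z)
;     y  = ((x + b->round[rc]) * b->quant_fast[rc]) >> 16;
;     d->qcoeff[rc]  = (y ^ sz) - sz;            // restore the sign
;     d->dqcoeff[rc] = d->qcoeff[rc] * d->dequant[rc];
;   }
;   d->eob = 1-based scan position of the last nonzero qcoeff (0 if none);
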
global sym(vp9_fast_quantize_b_sse2) PRIVATE
sym(vp9_fast_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %else
    ; these registers are used for passing arguments
  %endif
%endif

    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp9_block_coeff]
    mov         rcx, [rdi + vp9_block_round]
    mov         rdx, [rdi + vp9_block_quant_fast]

    ; z = coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; dup z so we can save sz
    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; x += round
    paddw       xmm1, [rcx]
    paddw       xmm5, [rcx + 16]

    mov         rax, [rsi + vp9_blockd_qcoeff]
    mov         rcx, [rsi + vp9_blockd_dequant]
    mov         rdi, [rsi + vp9_blockd_dqcoeff]

    ; y = x * quant >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    ; x = (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; qcoeff = x
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    ; x * dequant
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm5
    pmullw      xmm2, [rcx]
    pmullw      xmm3, [rcx + 16]

    ; dqcoeff = x * dequant
    movdqa      [rdi], xmm2
    movdqa      [rdi + 16], xmm3

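    ; eob: the same reduction as in the regular path above: mask the
    ; nonzero lanes, AND with the 1-based scan positions in inv_zig_zag,
    ; then fold with pmaxsw down to a single word.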
    pxor        xmm4, xmm4                  ; clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ; set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

    pand        xmm1, [GLOBAL(inv_zig_zag)]
    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]

    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    movd        eax, xmm1
    and         eax, 0xff
    mov         [rsi + vp9_blockd_eob], eax

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret

SECTION_RODATA
align 16
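; 1-based zig-zag scan position of each of the 16 coefficients, stored in
; raster order; ANDing with a nonzero-lane mask and taking the max of the
; result yields the eob count directly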
inv_zig_zag:
  dw 0x0001, 0x0002, 0x0006, 0x0007
  dw 0x0003, 0x0005, 0x0008, 0x000d
  dw 0x0004, 0x0009, 0x000c, 0x000e
  dw 0x000a, 0x000b, 0x000f, 0x0010