;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
|  |  | 
|  | %include "aom_ports/x86_abi_support.asm" | 
|  |  | 
; TABULATE_SSIM - fold one group of eight pixel pairs into the running totals.
;
; In:    xmm3 = eight 16-bit source (s) values
;        xmm4 = eight 16-bit reference (r) values
; Accum: xmm15 += s     (word lanes, unsigned saturating add)
;        xmm14 += r     (word lanes, unsigned saturating add)
;        xmm13 += s*s   (dword lanes; pmaddwd pre-adds adjacent products)
;        xmm12 += r*r   (dword lanes)
;        xmm11 += s*r   (dword lanes)
; Clobb: xmm1, xmm2, xmm3
%macro TABULATE_SSIM 0
    ; source side: running sum, then sum of squares
    paddusw         xmm15, xmm3     ; sum_s
    movdqa          xmm1, xmm3
    pmaddwd         xmm1, xmm1      ; s[2i]^2 + s[2i+1]^2
    paddd           xmm13, xmm1     ; sum_sq_s

    ; reference side: running sum, then sum of squares
    paddusw         xmm14, xmm4     ; sum_r
    movdqa          xmm2, xmm4
    pmaddwd         xmm2, xmm2      ; r[2i]^2 + r[2i+1]^2
    paddd           xmm12, xmm2     ; sum_sq_r

    ; cross term (consumes xmm3)
    pmaddwd         xmm3, xmm4      ; s[2i]*r[2i] + s[2i+1]*r[2i+1]
    paddd           xmm11, xmm3     ; sum_sxr
%endmacro
|  |  | 
; SUM_ACROSS_Q - horizontal sum of the four dwords in register %1.
;
; The dwords are zero-extended to qwords (by interleaving with xmm0) and then
; added, so the total is produced as a 64-bit value.
;
; In:    %1   = four 32-bit lanes to add together
;        xmm0 = must be all zero on entry
; Out:   %1   = low qword holds the total, high qword is zero
; Clobb: xmm2 (so %1 must not be xmm2)
%macro SUM_ACROSS_Q 1
    ; 4 x dword -> 2 x qword partial sums
    movdqa          xmm2, %1
    punpckhdq       xmm2, xmm0      ; xmm2 = { d2, d3 } as qwords
    punpckldq       %1, xmm0        ; %1   = { d0, d1 } as qwords
    paddq           %1, xmm2        ; %1   = { d0+d2, d1+d3 }

    ; 2 x qword -> single qword total in the low half
    movdqa          xmm2, %1
    punpckhqdq      xmm2, xmm0      ; xmm2 = { d1+d3, 0 }
    punpcklqdq      %1, xmm0        ; %1   = { d0+d2, 0 }
    paddq           %1, xmm2        ; %1   = { d0+d1+d2+d3, 0 }
%endmacro
|  |  | 
; SUM_ACROSS_W - horizontal sum of the eight words in register %1.
;
; The words are zero-extended to dwords (by interleaving with xmm0), folded
; into four dword partial sums, and then reduced with SUM_ACROSS_Q.
;
; In:    %1   = eight 16-bit lanes to add together
;        xmm0 = must be all zero on entry
; Out:   %1   = low qword holds the total, high qword is zero
; Clobb: xmm1, xmm2 (so %1 must be neither of them)
%macro SUM_ACROSS_W 1
    movdqa          xmm1, %1
    punpckhwd       xmm1, xmm0      ; xmm1 = { w4, w5, w6, w7 } as dwords
    punpcklwd       %1, xmm0        ; %1   = { w0, w1, w2, w3 } as dwords
    paddd           %1, xmm1        ; four dword partial sums
    SUM_ACROSS_Q    %1              ; reduce the four dwords to one total
%endmacro
|  |  | 
SECTION .text

;-----------------------------------------------------------------------
; void aom_ssim_parms_16x16_sse2(
;     unsigned char *s,
;     int sp,
;     unsigned char *r,
;     int rp,
;     uint32_t *sum_s,
;     uint32_t *sum_r,
;     uint32_t *sum_sq_s,
;     uint32_t *sum_sq_r,
;     uint32_t *sum_sxr);
;
; Reads 16 rows of 16 bytes from s (row stride sp) and from r (row stride
; rp) and stores five 32-bit totals through the output pointers:
;     *sum_s    = sum of s[i]
;     *sum_r    = sum of r[i]
;     *sum_sq_s = sum of s[i] * s[i]
;     *sum_sq_r = sum of r[i] * r[i]
;     *sum_sxr  = sum of s[i] * r[i]
;
; Register roles inside the loop:
;     rsi / rdi   current s / r row        rcx / rax   sp / rp
;     rdx         rows remaining           xmm0        constant zero
;     xmm15..11   sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;     xmm1..6     scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
globalsym(aom_ssim_parms_16x16_sse2)
sym(aom_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of the argument slot are
    ; defined, so sign-extend them rather than loading all 64 bits.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0      ; zero, used for byte->word unpacking
    pxor            xmm15, xmm15    ;sum_s
    pxor            xmm14, xmm14    ;sum_r
    pxor            xmm13, xmm13    ;sum_sq_s
    pxor            xmm12, xmm12    ;sum_sq_r
    pxor            xmm11, xmm11    ;sum_sxr

    mov             rdx, 16         ;row counter
.next_row:

    ;grab source and reference pixels (no alignment assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ; upper eight bytes of each row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0      ; high_s
    punpckhbw       xmm4, xmm0      ; high_r

    TABULATE_SSIM

    ; lower eight bytes of each row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0      ; low_s
    punpcklbw       xmm4, xmm0      ; low_r

    TABULATE_SSIM

    add             rsi, rcx        ; next s row
    add             rdi, rax        ; next r row

    dec             rdx             ; counter
    jnz             .next_row

    ; Each word lane of xmm15/xmm14 received 2 values per row over 16 rows,
    ; each at most 255, so the saturating adds never actually saturated.
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through its output pointer
    mov             rdi, arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi, arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi, arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi, arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi, arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|  |  | 
;-----------------------------------------------------------------------
; void aom_ssim_parms_8x8_sse2(
;     unsigned char *s,
;     int sp,
;     unsigned char *r,
;     int rp,
;     uint32_t *sum_s,
;     uint32_t *sum_r,
;     uint32_t *sum_sq_s,
;     uint32_t *sum_sq_r,
;     uint32_t *sum_sxr);
;
; Reads 8 rows of 8 bytes from s (row stride sp) and from r (row stride rp)
; and stores five 32-bit totals through the output pointers:
;     *sum_s    = sum of s[i]
;     *sum_r    = sum of r[i]
;     *sum_sq_s = sum of s[i] * s[i]
;     *sum_sq_r = sum of r[i] * r[i]
;     *sum_sxr  = sum of s[i] * r[i]
;
; Register roles inside the loop:
;     rsi / rdi   current s / r row        rcx / rax   sp / rp
;     rdx         rows remaining           xmm0        constant zero
;     xmm15..11   sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;     xmm1..4     scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
globalsym(aom_ssim_parms_8x8_sse2)
sym(aom_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of the argument slot are
    ; defined, so sign-extend them rather than loading all 64 bits.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0      ; zero, used for byte->word unpacking
    pxor            xmm15, xmm15    ;sum_s
    pxor            xmm14, xmm14    ;sum_r
    pxor            xmm13, xmm13    ;sum_sq_s
    pxor            xmm12, xmm12    ;sum_sq_r
    pxor            xmm11, xmm11    ;sum_sxr

    mov             rdx, 8          ;row counter
.next_row:

    ;grab source and reference pixels: one 8-byte row each, widened to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0      ; low_s
    punpcklbw       xmm4, xmm0      ; low_r

    TABULATE_SSIM

    add             rsi, rcx        ; next s row
    add             rdi, rax        ; next r row

    dec             rdx             ; counter
    jnz             .next_row

    ; Each word lane of xmm15/xmm14 received 1 value per row over 8 rows,
    ; each at most 255, so the saturating adds never actually saturated.
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through its output pointer
    mov             rdi, arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi, arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi, arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi, arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi, arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret