| ; |
| ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
| ; |
| ; This source code is subject to the terms of the BSD 2 Clause License and |
| ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| ; was not distributed with this source code in the LICENSE file, you can |
| ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| ; Media Patent License 1.0 was not distributed with this source code in the |
| ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| ; |
| |
| ; |
| |
| %include "aom_ports/x86_abi_support.asm" |
| |
| ; TABULATE_SSIM - fold one vector of source/reference words into the sums. |
| ; In: xmm3 = source samples as 16-bit words (s) |
| ; xmm4 = reference samples as 16-bit words (r) |
| ; Updates: xmm15 (sum_s) and xmm14 (sum_r) by saturating word adds; |
| ; xmm13 (sum_sq_s), xmm12 (sum_sq_r), xmm11 (sum_sxr) by dword adds |
| ; of the pmaddwd pair-products. |
| ; Clobbers: xmm1 (left holding s*s), xmm2 (r*r), xmm3 (s*r). xmm4 is kept. |
| %macro TABULATE_SSIM 0 |
|     movdqa  xmm1, xmm3 ; working copy of s for the square |
|     movdqa  xmm2, xmm4 ; working copy of r for the square |
|     paddusw xmm15, xmm3 ; sum_s |
|     paddusw xmm14, xmm4 ; sum_r |
|     pmaddwd xmm1, xmm1 ; adjacent s*s products added into dwords |
|     pmaddwd xmm2, xmm2 ; adjacent r*r products added into dwords |
|     pmaddwd xmm3, xmm4 ; adjacent s*r products added into dwords |
|     paddd   xmm13, xmm1 ; sum_sq_s |
|     paddd   xmm12, xmm2 ; sum_sq_r |
|     paddd   xmm11, xmm3 ; sum_sxr |
| %endmacro |
| |
| ; SUM_ACROSS_Q - add the four dwords of register %1 together. |
| ; Each dword is interleaved with xmm0 to widen it to a qword, the qwords are |
| ; added pairwise, and the two partial sums are then added; the total is left |
| ; in the low qword of %1. |
| ; xmm0 must be zero on entry (both functions in this file clear it with pxor). |
| ; Clobbers: xmm2. |
| %macro SUM_ACROSS_Q 1 |
|     movdqa     xmm2, %1 |
|     punpckhdq  xmm2, xmm0 ; xmm2 = dwords 2,3 widened to qwords |
|     punpckldq  %1, xmm0 ; %1 = dwords 0,1 widened to qwords |
|     paddq      %1, xmm2 ; two qword partial sums |
|     movdqa     xmm2, %1 |
|     punpckhqdq xmm2, xmm0 ; xmm2 = upper partial sum moved to the low qword |
|     punpcklqdq %1, xmm0 ; %1 = lower partial sum, upper qword from xmm0 |
|     paddq      %1, xmm2 ; low qword = total of all four dwords |
| %endmacro |
| |
| ; SUM_ACROSS_W - add the eight words of register %1 together. |
| ; The words are interleaved with xmm0 to widen them to dwords, the two halves |
| ; are added, and SUM_ACROSS_Q finishes the reduction of the four dwords. |
| ; xmm0 must be zero on entry (both functions in this file clear it with pxor). |
| ; Clobbers: xmm1, plus whatever SUM_ACROSS_Q clobbers. |
| %macro SUM_ACROSS_W 1 |
|     movdqa    xmm1, %1 |
|     punpckhwd xmm1, xmm0 ; xmm1 = words 4..7 widened to dwords |
|     punpcklwd %1, xmm0 ; %1 = words 0..3 widened to dwords |
|     paddd     %1, xmm1 ; four dword partial sums |
|     SUM_ACROSS_Q %1 |
| %endmacro |
| |
| SECTION .text |
| |
| ;void av1_ssim_parms_16x16_sse2( |
| ; unsigned char *s, |
| ; int sp, |
| ; unsigned char *r, |
| ; int rp, |
| ; unsigned long *sum_s, |
| ; unsigned long *sum_r, |
| ; unsigned long *sum_sq_s, |
| ; unsigned long *sum_sq_r, |
| ; unsigned long *sum_sxr); |
| ; |
| ; Walks 16 rows of 16 bytes from s (stride sp) and r (stride rp) and stores |
| ; five totals through the output pointers: sum of s, sum of r, sum of s*s, |
| ; sum of r*r and sum of s*r. |
| ; |
| ; NOTE(review): each result is written with a 32-bit movd store, so only the |
| ; low 4 bytes behind each output pointer are updated - confirm the C prototype |
| ; declares a 32-bit element type rather than a 64-bit unsigned long. |
| ; |
| ; Register use: rsi = s row, rdi = r row, rcx = sp, rax = rp, rdx = rows left, |
| ; xmm0 = zero, xmm15/xmm14 = word sums of s/r, xmm13/xmm12/xmm11 = dword sums |
| ; of s*s, r*r, s*r. xmm1-xmm6 are scratch. |
| ; |
| ; TODO: Use parm passing through structure, probably don't need the pxors |
| ; ( calling app will initialize to 0 ) could easily fit everything in sse2 |
| ; without too much hassle, and can probably do better estimates with psadbw |
| ; or pavgb. At this point this is just meant to be first pass for calculating |
| ; all the parms needed for 16x16 ssim so we can play with dssim as distortion |
| ; in mode selection code. |
| globalsym(av1_ssim_parms_16x16_sse2) |
| sym(av1_ssim_parms_16x16_sse2): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 9 |
| SAVE_XMM 15 |
| push rsi |
| push rdi |
| ; end prolog |
| |
| mov rsi, arg(0) ;s |
| ; sp and rp are 32-bit ints: the upper half of their 64-bit argument slots is |
| ; not guaranteed by the calling convention, so sign-extend the low dword |
| ; instead of loading the whole slot (this also keeps negative strides correct). |
| movsxd rcx, dword arg(1) ;sp |
| mov rdi, arg(2) ;r |
| movsxd rax, dword arg(3) ;rp |
| |
| pxor xmm0, xmm0 ;zero, used for unpacking |
| pxor xmm15,xmm15 ;sum_s |
| pxor xmm14,xmm14 ;sum_r |
| pxor xmm13,xmm13 ;sum_sq_s |
| pxor xmm12,xmm12 ;sum_sq_r |
| pxor xmm11,xmm11 ;sum_sxr |
| |
| mov rdx, 16 ;row counter |
| .NextRow: |
| |
| ;grab source and reference pixels (unaligned 16-byte loads) |
| movdqu xmm5, [rsi] |
| movdqu xmm6, [rdi] |
| movdqa xmm3, xmm5 |
| movdqa xmm4, xmm6 |
| punpckhbw xmm3, xmm0 ; high_s: upper 8 bytes widened to words |
| punpckhbw xmm4, xmm0 ; high_r |
| |
| TABULATE_SSIM |
| |
| movdqa xmm3, xmm5 |
| movdqa xmm4, xmm6 |
| punpcklbw xmm3, xmm0 ; low_s: lower 8 bytes widened to words |
| punpcklbw xmm4, xmm0 ; low_r |
| |
| TABULATE_SSIM |
| |
| add rsi, rcx ; next s row |
| add rdi, rax ; next r row |
| |
| dec rdx ; counter |
| jnz .NextRow |
| |
| ; reduce each accumulator to a single total in its low lane |
| SUM_ACROSS_W xmm15 |
| SUM_ACROSS_W xmm14 |
| SUM_ACROSS_Q xmm13 |
| SUM_ACROSS_Q xmm12 |
| SUM_ACROSS_Q xmm11 |
| |
| ; store the low 32 bits of each total through the output pointers |
| mov rdi,arg(4) |
| movd [rdi], xmm15 ;*sum_s |
| mov rdi,arg(5) |
| movd [rdi], xmm14 ;*sum_r |
| mov rdi,arg(6) |
| movd [rdi], xmm13 ;*sum_sq_s |
| mov rdi,arg(7) |
| movd [rdi], xmm12 ;*sum_sq_r |
| mov rdi,arg(8) |
| movd [rdi], xmm11 ;*sum_sxr |
| |
| ; begin epilog |
| pop rdi |
| pop rsi |
| RESTORE_XMM |
| UNSHADOW_ARGS |
| pop rbp |
| ret |
| |
| ;void av1_ssim_parms_8x8_sse2( |
| ; unsigned char *s, |
| ; int sp, |
| ; unsigned char *r, |
| ; int rp, |
| ; unsigned long *sum_s, |
| ; unsigned long *sum_r, |
| ; unsigned long *sum_sq_s, |
| ; unsigned long *sum_sq_r, |
| ; unsigned long *sum_sxr); |
| ; |
| ; Walks 8 rows of 8 bytes from s (stride sp) and r (stride rp) and stores |
| ; five totals through the output pointers: sum of s, sum of r, sum of s*s, |
| ; sum of r*r and sum of s*r. |
| ; |
| ; NOTE(review): each result is written with a 32-bit movd store, so only the |
| ; low 4 bytes behind each output pointer are updated - confirm the C prototype |
| ; declares a 32-bit element type rather than a 64-bit unsigned long. |
| ; |
| ; Register use: rsi = s row, rdi = r row, rcx = sp, rax = rp, rdx = rows left, |
| ; xmm0 = zero, xmm15/xmm14 = word sums of s/r, xmm13/xmm12/xmm11 = dword sums |
| ; of s*s, r*r, s*r. xmm1-xmm4 are scratch. |
| ; |
| ; TODO: Use parm passing through structure, probably don't need the pxors |
| ; ( calling app will initialize to 0 ) could easily fit everything in sse2 |
| ; without too much hassle, and can probably do better estimates with psadbw |
| ; or pavgb. At this point this is just meant to be first pass for calculating |
| ; all the parms needed for 8x8 ssim so we can play with dssim as distortion |
| ; in mode selection code. |
| globalsym(av1_ssim_parms_8x8_sse2) |
| sym(av1_ssim_parms_8x8_sse2): |
| push rbp |
| mov rbp, rsp |
| SHADOW_ARGS_TO_STACK 9 |
| SAVE_XMM 15 |
| push rsi |
| push rdi |
| ; end prolog |
| |
| mov rsi, arg(0) ;s |
| ; sp and rp are 32-bit ints: the upper half of their 64-bit argument slots is |
| ; not guaranteed by the calling convention, so sign-extend the low dword |
| ; instead of loading the whole slot (this also keeps negative strides correct). |
| movsxd rcx, dword arg(1) ;sp |
| mov rdi, arg(2) ;r |
| movsxd rax, dword arg(3) ;rp |
| |
| pxor xmm0, xmm0 ;zero, used for unpacking |
| pxor xmm15,xmm15 ;sum_s |
| pxor xmm14,xmm14 ;sum_r |
| pxor xmm13,xmm13 ;sum_sq_s |
| pxor xmm12,xmm12 ;sum_sq_r |
| pxor xmm11,xmm11 ;sum_sxr |
| |
| mov rdx, 8 ;row counter |
| .NextRow: |
| |
| ;grab source and reference pixels (8 bytes per row) |
| movq xmm3, [rsi] |
| movq xmm4, [rdi] |
| punpcklbw xmm3, xmm0 ; low_s: 8 bytes widened to words |
| punpcklbw xmm4, xmm0 ; low_r |
| |
| TABULATE_SSIM |
| |
| add rsi, rcx ; next s row |
| add rdi, rax ; next r row |
| |
| dec rdx ; counter |
| jnz .NextRow |
| |
| ; reduce each accumulator to a single total in its low lane |
| SUM_ACROSS_W xmm15 |
| SUM_ACROSS_W xmm14 |
| SUM_ACROSS_Q xmm13 |
| SUM_ACROSS_Q xmm12 |
| SUM_ACROSS_Q xmm11 |
| |
| ; store the low 32 bits of each total through the output pointers |
| mov rdi,arg(4) |
| movd [rdi], xmm15 ;*sum_s |
| mov rdi,arg(5) |
| movd [rdi], xmm14 ;*sum_r |
| mov rdi,arg(6) |
| movd [rdi], xmm13 ;*sum_sq_s |
| mov rdi,arg(7) |
| movd [rdi], xmm12 ;*sum_sq_r |
| mov rdi,arg(8) |
| movd [rdi], xmm11 ;*sum_sxr |
| |
| ; begin epilog |
| pop rdi |
| pop rsi |
| RESTORE_XMM |
| UNSHADOW_ARGS |
| pop rbp |
| ret |