;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; Advance pointer register %1 by %2 elements of tran_low_t.
;   %1  pointer register, updated in place
;   %2  element count; scaled by 4, the per-element byte size this file uses
;       for tran_low_t
; Implemented with lea, so EFLAGS are left untouched.
; %2 is parenthesised before scaling (as LOAD_TRAN_LOW does with its offset)
; so that a compound argument such as `8 + 8` is multiplied as a whole.
%macro INCREMENT_ELEMENTS_TRAN_LOW 2
  lea %1, [%1 + (%2) * 4]
%endmacro

; Load tran_low_t values from %2 + %3 into m%1, narrowed to 16 bits.
;   %1  number of the destination vector register (m%1)
;   %2  base address
;   %3  offset in elements, not bytes; scaled by 4 here
; Elements are always read as 32-bit values: one register's worth with mova,
; and the following 16 bytes as the memory operand of packssdw, which packs
; both groups down to int16 with signed saturation. With the 16-byte XMM
; registers selected in this file that is 8 consecutive elements, kept in
; memory order. Only m%1 is written.
%macro LOAD_TRAN_LOW 3
  mova     m%1, [%2 + (%3) * 4]
  packssdw m%1, [%2 + (%3) * 4 + 16]
%endmacro

; Symbol prefix picked up by x86inc: `cglobal block_error` below is exported
; as av1_block_error.
%define private_prefix av1

; Provides cglobal, INIT_XMM, RET, mova and the m<N> / <arg>q / <arg>m aliases
; used in the rest of this file.
%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; int64_t av1_block_error(tran_low_t *coeff, tran_low_t *dqcoeff,
;                         intptr_t block_size, int64_t *ssz)
;
; Returns  sum((dqcoeff[i] - coeff[i])^2)  and stores  sum(coeff[i]^2)  to
; *ssz, over block_size elements. Both buffers are only read.
;
; Elements are read as 32-bit values and saturated to int16 by LOAD_TRAN_LOW
; before any arithmetic, so all per-element math below is 16-bit.
;
; Expectations visible from the code (not checked here):
;   * coeff / dqcoeff are 16-byte aligned (mova and packssdw memory operands).
;   * block_size is a positive multiple of 16: the loop body runs before the
;     count is tested and always consumes 16 elements per pass.
;   * values fit in 15 bits + sign; the difference is a wrapping psubw.
;
; Register roles:
;   m0, m1  dqcoeff lanes, then differences, then squared-difference sums
;   m2, m3  coeff lanes, then squared-coeff sums
;   m4      64-bit sse accumulator (two lanes)
;   m5      zero, used to widen 32-bit lanes to 64 bits
;   m6      64-bit ssz accumulator (two lanes)
;   m7      scratch
;
; Only the first three arguments are loaded by cglobal. ssz is reached via
; sszq on x86-64 and fetched from the stack (sszm) on x86-32.
; Result: rax on x86-64, edx:eax on x86-32.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator
  pxor      m6, m6                 ; ssz accumulator
  pxor      m5, m5                 ; dedicated zero register
.loop:
  LOAD_TRAN_LOW 2, uqcq, 0         ; m2 = coeff[0..7]
  LOAD_TRAN_LOW 0, dqcq, 0         ; m0 = dqcoeff[0..7]
  LOAD_TRAN_LOW 3, uqcq, 8         ; m3 = coeff[8..15]
  LOAD_TRAN_LOW 1, dqcq, 8         ; m1 = dqcoeff[8..15]
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  ; Flags from this sub are consumed by the jg at the bottom of the loop;
  ; none of the SIMD instructions in between write EFLAGS.
  sub       sizeq, 16
  psubw     m0, m2                 ; dqcoeff - coeff
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  pmaddwd   m2, m2
  pmaddwd   m3, m3
  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  paddd     m0, m1
  paddd     m2, m3
  ; accumulate in 64bit: interleaving with the zero register zero-extends
  ; each 32-bit lane to 64 bits
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m2, m5
  paddq     m4, m0
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2
  jg        .loop                  ; loop while the remaining count is > 0

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  movhlps   m7, m6
  paddq     m4, m5
  paddq     m6, m7
%if ARCH_X86_64
  movq      rax, m4                ; return sse
  movq      [sszq], m6             ; *ssz = sum of squared coeffs
%else
  mov       eax, sszm              ; ssz pointer from the stack
  pshufd    m5, m4, 0x1            ; bring the high 32 bits of sse into lane 0
  movq      [eax], m6              ; *ssz = sum of squared coeffs
  movd      eax, m4                ; low half of the 64-bit return value
  movd      edx, m5                ; high half of the 64-bit return value
%endif
  RET