;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

|  | %include "third_party/x86inc/x86inc.asm" | 
|  |  | 
SECTION_RODATA

; Byte constants (16 bytes each).
pb_1:     times 16 db 1         ; LSB mask used by X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2
dc_128:   times 16 db 128       ; fill value of the dc_128 predictors

; Rounding terms added before the shift in the DC predictors that sum the
; above AND left edges of an NxN block (2N bytes, shift by log2(2N)).
pw_4:     times 8 dw 4
pw_8:     times 8 dw 8
pw_16:    times 8 dw 16
pw_32:    times 8 dw 32

; Rounding terms added before the shift in the DC predictors that sum ONE
; edge of an NxN block (N bytes, shift by log2(N)).
pw2_4:    times 8 dw 2
pw2_8:    times 8 dw 4
pw2_16:   times 8 dw 8
pw2_32:   times 8 dw 16

SECTION .text
|  |  | 
; ------------------------------------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per-byte computation of (x + 2*y + z + 2) >> 2 without widening to words.
; Trick from pascal:
;   result  = avg(x, z)        ; pavgb rounds up: (x + z + 1) >> 1
;   result -= xor(x, z) & 1    ; remove the round-up -> (x + z) >> 1
;   result  = avg(result, y)   ; (((x + z) >> 1) + y + 1) >> 1
;
; %1 (x) and %2 (y) are only read; %3 (z) is clobbered; %4 receives the
; result.  The pb_1 constant is fetched through GLOBAL().
; ------------------------------------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
    pavgb       %4, %1, %3              ; result = avg(x, z), rounded up
    pxor        %3, %1                  ; z ^= x (z is destroyed from here on)
    pand        %3, [GLOBAL(pb_1)]      ; keep bit 0: 1 where x + z was odd
    psubb       %4, %3                  ; drop the round-up of the first avg
    pavgb       %4, %2                  ; final average with y
%endmacro
|  |  | 
; dc_predictor_4x4: args dst, stride, above, left.
; Fills a 4x4 block with (sum of 4 above bytes + 4 left bytes + 4) >> 3.
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; zero reference for psadbw
    movd        m0, [aboveq]                ; 4 bytes of the above row
    movd        m2, [leftq]                 ; 4 bytes of the left column
    punpckldq   m0, m2                      ; low qword = above | left
    psadbw      m0, m1                      ; low word = sum of the 8 bytes
    paddw       m0, [GLOBAL(pw_4)]          ; + rounding term
    psraw       m0, 3                       ; / 8
    pshuflw     m0, m0, 0x0                 ; dc in each of the 4 low words
    packuswb    m0, m0                      ; words -> bytes, low dword = 4 x dc

    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0                  ; row 2
    movd        [dstq+strideq], m0          ; row 3

    RESTORE_GOT
    RET
|  |  | 
; dc_left_predictor_4x4: args dst, stride, above, left (above is not read).
; Fills a 4x4 block with (sum of 4 left bytes + 2) >> 2.
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
    movifnidn   leftq, leftmp               ; only 2 args are auto-loaded
    GET_GOT     goffsetq

    movd        m0, [leftq]                 ; 4 bytes of the left column
    pxor        m1, m1                      ; zero reference for psadbw
    psadbw      m0, m1                      ; low word = sum of the 4 bytes
    paddw       m0, [GLOBAL(pw2_4)]         ; + rounding term
    psraw       m0, 2                       ; / 4
    pshuflw     m0, m0, 0x0                 ; dc in each of the 4 low words
    packuswb    m0, m0                      ; words -> bytes, low dword = 4 x dc

    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0                  ; row 2
    movd        [dstq+strideq], m0          ; row 3

    RESTORE_GOT
    RET
|  |  | 
; dc_top_predictor_4x4: args dst, stride, above, left (left is not read).
; Fills a 4x4 block with (sum of 4 above bytes + 2) >> 2.
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movd        m0, [aboveq]                ; 4 bytes of the above row
    pxor        m1, m1                      ; zero reference for psadbw
    psadbw      m0, m1                      ; low word = sum of the 4 bytes
    paddw       m0, [GLOBAL(pw2_4)]         ; + rounding term
    psraw       m0, 2                       ; / 4
    pshuflw     m0, m0, 0x0                 ; dc in each of the 4 low words
    packuswb    m0, m0                      ; words -> bytes, low dword = 4 x dc

    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0                  ; row 2
    movd        [dstq+strideq], m0          ; row 3

    RESTORE_GOT
    RET
|  |  | 
; dc_predictor_8x8: args dst, stride, above, left.
; Fills an 8x8 block with (sum of 8 above bytes + 8 left bytes + 8) >> 4.
INIT_XMM sse2
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]                ; 8 bytes of the above row
    movq        m2, [leftq]                 ; 8 bytes of the left column
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3        ; above/left registers are free now
    psadbw      m0, m1                      ; low word = sum(above)
    psadbw      m2, m1                      ; low word = sum(left)
    paddw       m0, m2
    paddw       m0, [GLOBAL(pw_8)]          ; + rounding term
    psraw       m0, 4                       ; / 16
    punpcklbw   m0, m0                      ; low word = dc in both bytes
    pshuflw     m0, m0, 0x0                 ; low qword = 8 x dc
    lea         stride3q, [strideq*3]

    movq        [dstq], m0                  ; rows 0..3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4..7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
|  |  | 
; dc_top_predictor_8x8: args dst, stride, above, left (left is not read).
; Fills an 8x8 block with (sum of 8 above bytes + 4) >> 3.
INIT_XMM sse2
cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]                ; 8 bytes of the above row
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3        ; above register is free now
    psadbw      m0, m1                      ; low word = sum(above)
    paddw       m0, [GLOBAL(pw2_8)]         ; + rounding term
    psraw       m0, 3                       ; / 8
    punpcklbw   m0, m0                      ; low word = dc in both bytes
    pshuflw     m0, m0, 0x0                 ; low qword = 8 x dc
    lea         stride3q, [strideq*3]

    movq        [dstq], m0                  ; rows 0..3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4..7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
|  |  | 
; dc_left_predictor_8x8: args dst, stride, above, left (above is not read).
; Fills an 8x8 block with (sum of 8 left bytes + 4) >> 3.
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
    movifnidn   leftq, leftmp               ; only 2 args are auto-loaded
    GET_GOT     goffsetq

    movq        m0, [leftq]                 ; 8 bytes of the left column
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3        ; above/left registers are free now
    psadbw      m0, m1                      ; low word = sum(left)
    paddw       m0, [GLOBAL(pw2_8)]         ; + rounding term
    psraw       m0, 3                       ; / 8
    punpcklbw   m0, m0                      ; low word = dc in both bytes
    pshuflw     m0, m0, 0x0                 ; low qword = 8 x dc
    lea         stride3q, [strideq*3]

    movq        [dstq], m0                  ; rows 0..3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4..7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
|  |  | 
; dc_128_predictor_4x4: args dst, stride, above, left (neither edge is read).
; Fills a 4x4 block with the constant byte 128.
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3
    movd        m0, [GLOBAL(dc_128)]        ; low dword = 4 x 128
    lea         stride3q, [strideq*3]
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    movd        [dstq+strideq*2], m0        ; row 2
    movd        [dstq+stride3q], m0         ; row 3
    RESTORE_GOT
    RET
|  |  | 
; dc_128_predictor_8x8: args dst, stride, above, left (neither edge is read).
; Fills an 8x8 block with the constant byte 128.
INIT_XMM sse2
cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3
    movq        m0, [GLOBAL(dc_128)]        ; low qword = 8 x 128
    lea         stride3q, [strideq*3]
    movq        [dstq], m0                  ; rows 0..3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4..7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    RESTORE_GOT
    RET
|  |  | 
; dc_predictor_16x16: args dst, stride, above, left.
; Fills a 16x16 block with (sum of 16 above + 16 left bytes + 16) >> 5.
; Both edges and every output row are accessed with aligned (mova) moves.
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; 16 bytes of the above row
    mova        m2, [leftq]                 ; 16 bytes of the left column
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3, lines4 ; above/left registers are reused
    psadbw      m0, m1                      ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; low word = total of 32 bytes
    paddw       m0, [GLOBAL(pw_16)]         ; + rounding term
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; dc in the 4 low words
    punpcklqdq  m0, m0                      ; dc in all 8 words
    packuswb    m0, m0                      ; 16 x dc bytes
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
|  |  | 
|  |  | 
; dc_top_predictor_16x16: args dst, stride, above, left (left is not read).
; Fills a 16x16 block with (sum of 16 above bytes + 8) >> 4.
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; 16 bytes of the above row
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3, lines4 ; above/left registers are reused
    psadbw      m0, m1                      ; one partial sum per qword
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; low word = total of 16 bytes
    paddw       m0, [GLOBAL(pw2_16)]        ; + rounding term
    psraw       m0, 4                       ; / 16
    pshuflw     m0, m0, 0x0                 ; dc in the 4 low words
    punpcklqdq  m0, m0                      ; dc in all 8 words
    packuswb    m0, m0                      ; 16 x dc bytes
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
|  |  | 
; dc_left_predictor_16x16: args dst, stride, above, left (above is not read).
; Fills a 16x16 block with (sum of 16 left bytes + 8) >> 4.
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [leftq]                 ; 16 bytes of the left column
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3, lines4 ; above/left registers are reused
    psadbw      m0, m1                      ; one partial sum per qword
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; low word = total of 16 bytes
    paddw       m0, [GLOBAL(pw2_16)]        ; + rounding term
    psraw       m0, 4                       ; / 16
    pshuflw     m0, m0, 0x0                 ; dc in the 4 low words
    punpcklqdq  m0, m0                      ; dc in all 8 words
    packuswb    m0, m0                      ; 16 x dc bytes
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
|  |  | 
; dc_128_predictor_16x16: args dst, stride, above, left (neither edge is read).
; Fills a 16x16 block with the constant byte 128 using aligned 16-byte stores.
;
; Only dst and stride are consumed and only m0 is used, so the cglobal line
; declares 2 loaded args and 1 XMM register, matching dc_128_predictor_4x4
; and dc_128_predictor_8x8.  The stride3/lines4 registers are written before
; they are read, so they do not need their incoming argument values.
; The loop exit uses REP_RET like the other looped predictors in this file.
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
    mova        m0, [GLOBAL(dc_128)]        ; 16 x 128
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop
    RESTORE_GOT
    REP_RET
|  |  | 
|  |  | 
; dc_predictor_32x32: args dst, stride, above, left.
; Fills a 32x32 block with (sum of 32 above + 32 left bytes + 32) >> 6.
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    mova        m3, [leftq]                 ; left[0..15]
    mova        m4, [leftq+16]              ; left[16..31]
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3, lines4 ; above/left registers are reused
    psadbw      m0, m1                      ; partial sums, one per qword
    psadbw      m2, m1
    paddw       m0, m2
    psadbw      m3, m1
    paddw       m0, m3
    psadbw      m4, m1
    paddw       m0, m4
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; low word = total of 64 bytes
    paddw       m0, [GLOBAL(pw_32)]         ; + rounding term
    psraw       m0, 6                       ; / 64
    pshuflw     m0, m0, 0x0                 ; dc in the 4 low words
    punpcklqdq  m0, m0                      ; dc in all 8 words
    packuswb    m0, m0                      ; 16 x dc bytes
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
|  |  | 
; dc_top_predictor_32x32: args dst, stride, above, left (left is not read).
; Fills a 32x32 block with (sum of 32 above bytes + 16) >> 5.
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3, lines4 ; above/left registers are reused
    psadbw      m0, m1                      ; partial sums, one per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; low word = total of 32 bytes
    paddw       m0, [GLOBAL(pw2_32)]        ; + rounding term
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; dc in the 4 low words
    punpcklqdq  m0, m0                      ; dc in all 8 words
    packuswb    m0, m0                      ; 16 x dc bytes
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
|  |  | 
; dc_left_predictor_32x32: args dst, stride, above, left (above is not read).
; Fills a 32x32 block with (sum of 32 left bytes + 16) >> 5.
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [leftq]                 ; left[0..15]
    mova        m2, [leftq+16]              ; left[16..31]
    pxor        m1, m1                      ; zero reference for psadbw
    DEFINE_ARGS dst, stride, stride3, lines4 ; above/left registers are reused
    psadbw      m0, m1                      ; partial sums, one per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; low word = total of 32 bytes
    paddw       m0, [GLOBAL(pw2_32)]        ; + rounding term
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; dc in the 4 low words
    punpcklqdq  m0, m0                      ; dc in all 8 words
    packuswb    m0, m0                      ; 16 x dc bytes
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
|  |  | 
; dc_128_predictor_32x32: args dst, stride, above, left (neither edge is read).
; Fills a 32x32 block with the constant byte 128, two aligned 16-byte stores
; per row.
;
; Only dst and stride are consumed and only m0 is used, so the cglobal line
; declares 2 loaded args and 1 XMM register, matching dc_128_predictor_4x4
; and dc_128_predictor_8x8.  The stride3/lines4 registers are written before
; they are read, so they do not need their incoming argument values.
; The loop exit uses REP_RET like the other looped predictors in this file.
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
    mova        m0, [GLOBAL(dc_128)]        ; 16 x 128
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop
    RESTORE_GOT
    REP_RET
|  |  | 
; v_predictor_4x4: args dst, stride, above.
; Copies the 4 bytes at above into each of the 4 rows of dst.
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd        m0, [aboveq]                ; the row to replicate
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0                  ; row 2
    movd        [dstq+strideq], m0          ; row 3
    RET
|  |  | 
; v_predictor_8x8: args dst, stride, above.
; Copies the 8 bytes at above into each of the 8 rows of dst.
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]                ; the row to replicate
    DEFINE_ARGS dst, stride, stride3        ; above register becomes stride3
    lea         stride3q, [strideq*3]
    movq        [dstq], m0                  ; rows 0..3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4..7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    RET
|  |  | 
; v_predictor_16x16: args dst, stride, above.
; Copies the 16 bytes at above into each of the 16 rows of dst (aligned moves).
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova        m0, [aboveq]                ; the row to replicate
    DEFINE_ARGS dst, stride, stride3, nlines4 ; above register becomes stride3
    mov         nlines4d, 4                 ; 4 iterations x 4 rows
    lea         stride3q, [strideq*3]
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         nlines4d
    jnz         .loop
    REP_RET
|  |  | 
; v_predictor_32x32: args dst, stride, above.
; Copies the 32 bytes at above into each of the 32 rows of dst (aligned moves).
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; left half of the row
    mova        m1, [aboveq+16]             ; right half of the row
    DEFINE_ARGS dst, stride, stride3, nlines4 ; above register becomes stride3
    mov         nlines4d, 8                 ; 8 iterations x 4 rows
    lea         stride3q, [strideq*3]
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m1
    lea         dstq, [dstq+strideq*4]
    dec         nlines4d
    jnz         .loop
    REP_RET
|  |  | 
; h_predictor_4x4: args dst, stride, line, left (the third argument is not
; read).  Row r of the 4x4 block is filled with the byte left[r].
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only 2 args are auto-loaded
    movd        m0, [leftq]                 ; l1 l2 l3 l4
    punpcklbw   m0, m0                      ; each byte doubled
    punpcklbw   m0, m0                      ; dword r = 4 x left[r]
    pshufd      m1, m0, 0x1                 ; low dword = 4 x l2
    pshufd      m2, m0, 0x2                 ; low dword = 4 x l3
    pshufd      m3, m0, 0x3                 ; low dword = 4 x l4
    movd        [dstq], m0                  ; row 0 (4 x l1)
    movd        [dstq+strideq], m1          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m2                  ; row 2
    movd        [dstq+strideq], m3          ; row 3
    RET
|  |  | 
; h_predictor_8x8: args dst, stride, line, left (the third argument's register
; is reused as the loop counter).  Row r of the 8x8 block is filled with the
; byte left[r]; two iterations of four rows each.
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only 2 args are auto-loaded
    DEFINE_ARGS dst, stride, line, left, stride3
    movq        m0, [leftq]                 ; l1 .. l8
    punpcklbw   m0, m0                      ; l1 l1 l2 l2 ... l8 l8
    lea         stride3q, [strideq*3]
    mov         lineq, -2                   ; counts up to zero
.loop:
    pshuflw     m1, m0, 0x0                 ; low qword = 8 x word 0
    pshuflw     m2, m0, 0x55                ; low qword = 8 x word 1
    movq        [dstq], m1
    movq        [dstq+strideq], m2
    pshuflw     m1, m0, 0xaa                ; word 2
    pshuflw     m2, m0, 0xff                ; word 3
    movq        [dstq+strideq*2], m1
    movq        [dstq+stride3q], m2
    pshufd      m0, m0, 0xe                 ; high qword -> low: l5 l5 ... l8 l8
    lea         dstq, [dstq+strideq*4]
    inc         lineq
    jnz         .loop
    REP_RET
|  |  | 
; h_predictor_16x16: args dst, stride, line, left (the third argument's
; register is reused as the loop counter).  Row r of the 16x16 block is filled
; with the byte left[r]; four iterations, each consuming 4 bytes of left.
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only 2 args are auto-loaded
    DEFINE_ARGS dst, stride, line, left, stride3
    lea         stride3q, [strideq*3]
    mov         lineq, -4                   ; counts up to zero
.loop:
    movd        m0, [leftq]                 ; next 4 left bytes
    punpcklbw   m0, m0
    punpcklbw   m0, m0                      ; dword k = 4 x left byte k
    pshufd      m1, m0, 0x0                 ; 16 x first byte
    pshufd      m2, m0, 0x55                ; 16 x second byte
    mova        [dstq], m1
    mova        [dstq+strideq], m2
    pshufd      m1, m0, 0xaa                ; 16 x third byte
    pshufd      m2, m0, 0xff                ; 16 x fourth byte
    mova        [dstq+strideq*2], m1
    mova        [dstq+stride3q], m2
    lea         leftq, [leftq+4]
    lea         dstq, [dstq+strideq*4]
    inc         lineq
    jnz         .loop
    REP_RET
|  |  | 
; h_predictor_32x32: args dst, stride, line, left (the third argument's
; register is reused as the loop counter).  Row r of the 32x32 block is filled
; with the byte left[r]; eight iterations, each consuming 4 bytes of left.
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only 2 args are auto-loaded
    DEFINE_ARGS dst, stride, line, left, stride3
    lea         stride3q, [strideq*3]
    mov         lineq, -8                   ; counts up to zero
.loop:
    movd        m0, [leftq]                 ; next 4 left bytes
    punpcklbw   m0, m0
    punpcklbw   m0, m0                      ; dword k = 4 x left byte k
    pshufd      m1, m0, 0x0                 ; 16 x first byte
    pshufd      m2, m0, 0x55                ; 16 x second byte
    mova        [dstq], m1
    mova        [dstq+16], m1
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m2
    pshufd      m1, m0, 0xaa                ; 16 x third byte
    pshufd      m2, m0, 0xff                ; 16 x fourth byte
    mova        [dstq+strideq*2], m1
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q], m2
    mova        [dstq+stride3q+16], m2
    lea         leftq, [leftq+4]
    lea         dstq, [dstq+strideq*4]
    inc         lineq
    jnz         .loop
    REP_RET
|  |  | 
; tm_predictor_4x4: args dst, stride, above, left.
; dst[r][c] = clip_to_byte(left[r] + above[c] - above[-1]) for a 4x4 block.
; The 8-byte load starts at above[-1]; only its first five bytes are used.
INIT_XMM sse2
cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    movq        m0, [aboveq-1]              ; tl t1 t2 t3 t4 x x x [byte]
    movd        m2, [leftq]                 ; l1 l2 l3 l4 [byte]
    punpcklbw   m0, m1                      ; tl t1 t2 t3 t4 x x x [word]
    punpcklbw   m2, m1                      ; l1 l2 l3 l4 [word]
    pshuflw     m3, m0, 0x0                 ; [63:0] tl tl tl tl [word]
    psrldq      m0, 2                       ; [63:0] t1 t2 t3 t4 [word]
    psubw       m0, m3                      ; [63:0] t1-tl t2-tl t3-tl t4-tl

    ; rows 0 and 1
    pshuflw     m4, m2, 0x0                 ; [63:0] l1 l1 l1 l1 [word]
    paddw       m4, m0
    packuswb    m4, m4                      ; saturate to bytes
    pshuflw     m3, m2, 0x55                ; [63:0] l2 l2 l2 l2 [word]
    paddw       m3, m0
    packuswb    m3, m3
    movd        [dstq], m4
    movd        [dstq+strideq], m3
    lea         dstq, [dstq+strideq*2]

    ; rows 2 and 3
    pshuflw     m4, m2, 0xaa                ; [63:0] l3 l3 l3 l3 [word]
    paddw       m4, m0
    packuswb    m4, m4
    pshuflw     m3, m2, 0xff                ; [63:0] l4 l4 l4 l4 [word]
    paddw       m3, m0
    packuswb    m3, m3
    movd        [dstq], m4
    movd        [dstq+strideq], m3
    RET
|  |  | 
; tm_predictor_8x8: args dst, stride, above, left.
; dst[r][c] = clip_to_byte(left[r] + above[c] - above[-1]) for an 8x8 block.
; Four iterations, two rows each; the above register is reused as the counter.
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    movq        m0, [aboveq]                ; t1 .. t8 [byte]
    movd        m2, [aboveq-1]              ; byte 0 = tl
    punpcklbw   m0, m1                      ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0x0                 ; [63:0] tl tl tl tl [word]
    punpcklqdq  m2, m2                      ; tl in all 8 words
    psubw       m0, m2                      ; t1-tl t2-tl ... t8-tl [word]
    DEFINE_ARGS dst, stride, line, left
    mov         lineq, -4                   ; counts up to zero
    movq        m2, [leftq]
    punpcklbw   m2, m1                      ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
.loop:
    pshuflw     m4, m2, 0x0                 ; [63:0] 4 x current left value
    pshuflw     m3, m2, 0x55                ; [63:0] 4 x next left value
    punpcklqdq  m4, m4                      ; 8 x current
    punpcklqdq  m3, m3                      ; 8 x next
    paddw       m4, m0
    paddw       m3, m0
    packuswb    m4, m3                      ; low qword = row r, high = row r+1
    movq        [dstq], m4
    movhps      [dstq+strideq], m4
    psrldq      m2, 4                       ; drop the two consumed left words
    lea         dstq, [dstq+strideq*2]
    inc         lineq
    jnz         .loop
    REP_RET
|  |  | 
; tm_predictor_16x16: args dst, stride, above, left.
; dst[r][c] = clip_to_byte(left[r] + above[c] - above[-1]) for a 16x16 block.
; Eight iterations; each writes row r (from l1..l8) and row r+8 (from
; l9..l16).  tl is taken from the top byte of the aligned 16-byte load at
; above-16.
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    mova        m0, [aboveq]                ; t1 t2 ... t16 [byte]
    mova        m2, [aboveq-16]             ; byte 15 = above[-1] = tl
    mova        m3, [leftq]                 ; l1 l2 ... l16 [byte]
    DEFINE_ARGS dst, stride, line, left, stride8
    punpckhbw   m4, m0, m1
    punpcklbw   m0, m1                      ; m0:m4 t1 t2 ... t16 [word]
    punpckhbw   m2, m1                      ; [127:112] tl [word]
    pshufhw     m2, m2, 0xff                ; tl in the 4 high words
    punpckhqdq  m2, m2                      ; tl repeated 8 times [word]
    psubw       m0, m2
    psubw       m4, m2                      ; m0:m4 t1-tl t2-tl ... t16-tl [word]
    punpckhbw   m5, m3, m1
    punpcklbw   m3, m1                      ; m3:m5 l1 l2 ... l16 [word]
    lea         stride8q, [strideq*8]
    mov         lineq, -8                   ; counts up to zero
.loop:
    ; row r: left value is word 0 of m3
    pshuflw     m6, m3, 0x0
    punpcklqdq  m6, m6                      ; left value repeated 8 times [word]
    paddw       m1, m6, m0
    paddw       m6, m4                      ; m1:m6 ti-tl+l [i=1..16] [word]
    packuswb    m1, m6
    mova        [dstq], m1
    ; row r+8: left value is word 0 of m5
    pshuflw     m7, m5, 0x0
    punpcklqdq  m7, m7                      ; left value repeated 8 times [word]
    paddw       m1, m7, m0
    paddw       m7, m4                      ; m1:m7 ti-tl+l [i=1..16] [word]
    packuswb    m1, m7
    mova        [dstq+stride8q], m1
    ; advance both left vectors by one word and dst by one row
    psrldq      m3, 2
    psrldq      m5, 2
    lea         dstq, [dstq+strideq]
    inc         lineq
    jnz         .loop
    REP_RET
|  |  | 
; tm_predictor_32x32: args dst, stride, above, left.
; dst[r][c] = clip_to_byte(left[r] + above[c] - above[-1]) for a 32x32 block.
; Sixteen iterations, two rows each.  left is advanced by 32 up front and
; indexed with the negative counter (left + 32 + 2*line).
; NOTE(review): the movd in the loop reads 4 bytes but uses 2, so the last
; iteration reads left[30..33] - confirm that callers pad the left buffer.
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    movd        m2, [aboveq-1]              ; byte 0 = tl
    mova        m0, [aboveq]                ; t1 .. t16 [byte]
    mova        m4, [aboveq+16]             ; t17 .. t32 [byte]
    DEFINE_ARGS dst, stride, line, left
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0x0
    punpcklqdq  m2, m2                      ; tl repeated 8 times [word]
    punpckhbw   m3, m0, m1
    punpcklbw   m0, m1                      ; m0:m3 t1 .. t16 [word]
    punpckhbw   m5, m4, m1
    punpcklbw   m4, m1                      ; m4:m5 t17 .. t32 [word]
    psubw       m0, m2
    psubw       m3, m2
    psubw       m4, m2
    psubw       m5, m2                      ; m0:m3:m4:m5 ti-tl [word]
    add         leftq, 32                   ; point one past the end of left
    mov         lineq, -16                  ; counts up to zero
.loop:
    pxor        m1, m1                      ; m1 is a temporary below; re-zero it
    movd        m2, [leftq+lineq*2]         ; next two left bytes (+2 unused)
    punpcklbw   m2, m1
    pshuflw     m7, m2, 0x55
    pshuflw     m2, m2, 0x0
    punpcklqdq  m7, m7                      ; second left value x 8 [word]
    punpcklqdq  m2, m2                      ; first left value x 8 [word]
    ; row r, columns 0..15
    paddw       m1, m2, m0
    paddw       m6, m2, m3
    packuswb    m1, m6
    mova        [dstq], m1
    ; row r, columns 16..31
    paddw       m1, m2, m4
    paddw       m6, m2, m5
    packuswb    m1, m6
    mova        [dstq+16], m1
    ; row r+1, columns 0..15
    paddw       m1, m7, m0
    paddw       m6, m7, m3
    packuswb    m1, m6
    mova        [dstq+strideq], m1
    ; row r+1, columns 16..31
    paddw       m1, m7, m4
    paddw       m6, m7, m5
    packuswb    m1, m6
    mova        [dstq+strideq+16], m1
    lea         dstq, [dstq+strideq*2]
    inc         lineq
    jnz         .loop
    REP_RET