;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

 | %include "third_party/x86inc/x86inc.asm" | 
 |  | 
 | SECTION_RODATA | 
 | pb_1: times 16 db 1 | 
 | pw_4:  times 8 dw 4 | 
 | pw_8:  times 8 dw 8 | 
 | pw_16: times 8 dw 16 | 
 | pw_32: times 8 dw 32 | 
 | dc_128: times 16 db 128 | 
 | pw2_4:  times 8 dw 2 | 
 | pw2_8:  times 8 dw 4 | 
 | pw2_16:  times 8 dw 8 | 
 | pw2_32:  times 8 dw 16 | 
 |  | 
 | SECTION .text | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   movd                  m2, [leftq] | 
 |   movd                  m0, [aboveq] | 
 |   pxor                  m1, m1 | 
 |   punpckldq             m0, m2 | 
 |   psadbw                m0, m1 | 
 |   paddw                 m0, [GLOBAL(pw_4)] | 
 |   psraw                 m0, 3 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   packuswb              m0, m0 | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |   lea                 dstq, [dstq+strideq*2] | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |  | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset | 
 |   movifnidn          leftq, leftmp | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   movd                  m0, [leftq] | 
 |   psadbw                m0, m1 | 
 |   paddw                 m0, [GLOBAL(pw2_4)] | 
 |   psraw                 m0, 2 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   packuswb              m0, m0 | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |   lea                 dstq, [dstq+strideq*2] | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |  | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   movd                  m0, [aboveq] | 
 |   psadbw                m0, m1 | 
 |   paddw                 m0, [GLOBAL(pw2_4)] | 
 |   psraw                 m0, 2 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   packuswb              m0, m0 | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |   lea                 dstq, [dstq+strideq*2] | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |  | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   movq                  m0, [aboveq] | 
 |   movq                  m2, [leftq] | 
 |   DEFINE_ARGS dst, stride, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   psadbw                m0, m1 | 
 |   psadbw                m2, m1 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw_8)] | 
 |   psraw                 m0, 4 | 
 |   punpcklbw             m0, m0 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |  | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   movq                  m0, [aboveq] | 
 |   DEFINE_ARGS dst, stride, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   psadbw                m0, m1 | 
 |   paddw                 m0, [GLOBAL(pw2_8)] | 
 |   psraw                 m0, 3 | 
 |   punpcklbw             m0, m0 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |  | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset | 
 |   movifnidn          leftq, leftmp | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   movq                  m0, [leftq] | 
 |   DEFINE_ARGS dst, stride, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   psadbw                m0, m1 | 
 |   paddw                 m0, [GLOBAL(pw2_8)] | 
 |   psraw                 m0, 3 | 
 |   punpcklbw             m0, m0 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |  | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   DEFINE_ARGS dst, stride, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   movd     m0,        [GLOBAL(dc_128)] | 
 |   movd    [dstq          ], m0 | 
 |   movd    [dstq+strideq  ], m0 | 
 |   movd    [dstq+strideq*2], m0 | 
 |   movd    [dstq+stride3q ], m0 | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   DEFINE_ARGS dst, stride, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   movq    m0,        [GLOBAL(dc_128)] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   mova                  m0, [aboveq] | 
 |   mova                  m2, [leftq] | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 4 | 
 |   psadbw                m0, m1 | 
 |   psadbw                m2, m1 | 
 |   paddw                 m0, m2 | 
 |   movhlps               m2, m0 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw_16)] | 
 |   psraw                 m0, 5 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   punpcklqdq            m0, m0 | 
 |   packuswb              m0, m0 | 
 | .loop: | 
 |   mova    [dstq          ], m0 | 
 |   mova    [dstq+strideq  ], m0 | 
 |   mova    [dstq+strideq*2], m0 | 
 |   mova    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |  | 
 |   RESTORE_GOT | 
 |   REP_RET | 
 |  | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   mova                  m0, [aboveq] | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 4 | 
 |   psadbw                m0, m1 | 
 |   movhlps               m2, m0 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw2_16)] | 
 |   psraw                 m0, 4 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   punpcklqdq            m0, m0 | 
 |   packuswb              m0, m0 | 
 | .loop: | 
 |   mova    [dstq          ], m0 | 
 |   mova    [dstq+strideq  ], m0 | 
 |   mova    [dstq+strideq*2], m0 | 
 |   mova    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |  | 
 |   RESTORE_GOT | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   mova                  m0, [leftq] | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 4 | 
 |   psadbw                m0, m1 | 
 |   movhlps               m2, m0 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw2_16)] | 
 |   psraw                 m0, 4 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   punpcklqdq            m0, m0 | 
 |   packuswb              m0, m0 | 
 | .loop: | 
 |   mova    [dstq          ], m0 | 
 |   mova    [dstq+strideq  ], m0 | 
 |   mova    [dstq+strideq*2], m0 | 
 |   mova    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |  | 
 |   RESTORE_GOT | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 4 | 
 |   mova    m0,        [GLOBAL(dc_128)] | 
 | .loop: | 
 |   mova    [dstq          ], m0 | 
 |   mova    [dstq+strideq  ], m0 | 
 |   mova    [dstq+strideq*2], m0 | 
 |   mova    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   mova                  m0, [aboveq] | 
 |   mova                  m2, [aboveq+16] | 
 |   mova                  m3, [leftq] | 
 |   mova                  m4, [leftq+16] | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 8 | 
 |   psadbw                m0, m1 | 
 |   psadbw                m2, m1 | 
 |   psadbw                m3, m1 | 
 |   psadbw                m4, m1 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, m3 | 
 |   paddw                 m0, m4 | 
 |   movhlps               m2, m0 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw_32)] | 
 |   psraw                 m0, 6 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   punpcklqdq            m0, m0 | 
 |   packuswb              m0, m0 | 
 | .loop: | 
 |   mova [dstq             ], m0 | 
 |   mova [dstq          +16], m0 | 
 |   mova [dstq+strideq     ], m0 | 
 |   mova [dstq+strideq  +16], m0 | 
 |   mova [dstq+strideq*2   ], m0 | 
 |   mova [dstq+strideq*2+16], m0 | 
 |   mova [dstq+stride3q    ], m0 | 
 |   mova [dstq+stride3q +16], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |  | 
 |   RESTORE_GOT | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   mova                  m0, [aboveq] | 
 |   mova                  m2, [aboveq+16] | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 8 | 
 |   psadbw                m0, m1 | 
 |   psadbw                m2, m1 | 
 |   paddw                 m0, m2 | 
 |   movhlps               m2, m0 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw2_32)] | 
 |   psraw                 m0, 5 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   punpcklqdq            m0, m0 | 
 |   packuswb              m0, m0 | 
 | .loop: | 
 |   mova [dstq             ], m0 | 
 |   mova [dstq          +16], m0 | 
 |   mova [dstq+strideq     ], m0 | 
 |   mova [dstq+strideq  +16], m0 | 
 |   mova [dstq+strideq*2   ], m0 | 
 |   mova [dstq+strideq*2+16], m0 | 
 |   mova [dstq+stride3q    ], m0 | 
 |   mova [dstq+stride3q +16], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |  | 
 |   RESTORE_GOT | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   pxor                  m1, m1 | 
 |   mova                  m0, [leftq] | 
 |   mova                  m2, [leftq+16] | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 8 | 
 |   psadbw                m0, m1 | 
 |   psadbw                m2, m1 | 
 |   paddw                 m0, m2 | 
 |   movhlps               m2, m0 | 
 |   paddw                 m0, m2 | 
 |   paddw                 m0, [GLOBAL(pw2_32)] | 
 |   psraw                 m0, 5 | 
 |   pshuflw               m0, m0, 0x0 | 
 |   punpcklqdq            m0, m0 | 
 |   packuswb              m0, m0 | 
 | .loop: | 
 |   mova [dstq             ], m0 | 
 |   mova [dstq          +16], m0 | 
 |   mova [dstq+strideq     ], m0 | 
 |   mova [dstq+strideq  +16], m0 | 
 |   mova [dstq+strideq*2   ], m0 | 
 |   mova [dstq+strideq*2+16], m0 | 
 |   mova [dstq+stride3q    ], m0 | 
 |   mova [dstq+stride3q +16], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |  | 
 |   RESTORE_GOT | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset | 
 |   GET_GOT     goffsetq | 
 |  | 
 |   DEFINE_ARGS dst, stride, stride3, lines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              lines4d, 8 | 
 |   mova    m0,        [GLOBAL(dc_128)] | 
 | .loop: | 
 |   mova [dstq             ], m0 | 
 |   mova [dstq          +16], m0 | 
 |   mova [dstq+strideq     ], m0 | 
 |   mova [dstq+strideq  +16], m0 | 
 |   mova [dstq+strideq*2   ], m0 | 
 |   mova [dstq+strideq*2+16], m0 | 
 |   mova [dstq+stride3q    ], m0 | 
 |   mova [dstq+stride3q +16], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec              lines4d | 
 |   jnz .loop | 
 |   RESTORE_GOT | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above | 
 |   movd                  m0, [aboveq] | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |   lea                 dstq, [dstq+strideq*2] | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m0 | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above | 
 |   movq                  m0, [aboveq] | 
 |   DEFINE_ARGS dst, stride, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   movq    [dstq          ], m0 | 
 |   movq    [dstq+strideq  ], m0 | 
 |   movq    [dstq+strideq*2], m0 | 
 |   movq    [dstq+stride3q ], m0 | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above | 
 |   mova                  m0, [aboveq] | 
 |   DEFINE_ARGS dst, stride, stride3, nlines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              nlines4d, 4 | 
 | .loop: | 
 |   mova    [dstq          ], m0 | 
 |   mova    [dstq+strideq  ], m0 | 
 |   mova    [dstq+strideq*2], m0 | 
 |   mova    [dstq+stride3q ], m0 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec             nlines4d | 
 |   jnz .loop | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above | 
 |   mova                  m0, [aboveq] | 
 |   mova                  m1, [aboveq+16] | 
 |   DEFINE_ARGS dst, stride, stride3, nlines4 | 
 |   lea             stride3q, [strideq*3] | 
 |   mov              nlines4d, 8 | 
 | .loop: | 
 |   mova [dstq             ], m0 | 
 |   mova [dstq          +16], m1 | 
 |   mova [dstq+strideq     ], m0 | 
 |   mova [dstq+strideq  +16], m1 | 
 |   mova [dstq+strideq*2   ], m0 | 
 |   mova [dstq+strideq*2+16], m1 | 
 |   mova [dstq+stride3q    ], m0 | 
 |   mova [dstq+stride3q +16], m1 | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   dec             nlines4d | 
 |   jnz .loop | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left | 
 |   movifnidn          leftq, leftmp | 
 |   movd                  m0, [leftq] | 
 |   punpcklbw             m0, m0 | 
 |   punpcklbw             m0, m0 | 
 |   pshufd                m1, m0, 0x1 | 
 |   movd      [dstq        ], m0 | 
 |   movd      [dstq+strideq], m1 | 
 |   pshufd                m2, m0, 0x2 | 
 |   lea                 dstq, [dstq+strideq*2] | 
 |   pshufd                m3, m0, 0x3 | 
 |   movd      [dstq        ], m2 | 
 |   movd      [dstq+strideq], m3 | 
 |   RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left | 
 |   movifnidn          leftq, leftmp | 
 |   mov                lineq, -2 | 
 |   DEFINE_ARGS  dst, stride, line, left, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 |   movq                  m0, [leftq    ] | 
 |   punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8 | 
 | .loop: | 
 |   pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1 | 
 |   pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2 | 
 |   movq      [dstq        ], m1 | 
 |   movq      [dstq+strideq], m2 | 
 |   pshuflw               m1, m0, 0xaa | 
 |   pshuflw               m2, m0, 0xff | 
 |   movq    [dstq+strideq*2], m1 | 
 |   movq    [dstq+stride3q ], m2 | 
 |   pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 | 
 |   inc                lineq | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   jnz .loop | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left | 
 |   movifnidn          leftq, leftmp | 
 |   mov                lineq, -4 | 
 |   DEFINE_ARGS dst, stride, line, left, stride3 | 
 |   lea             stride3q, [strideq*3] | 
 | .loop: | 
 |   movd                  m0, [leftq] | 
 |   punpcklbw             m0, m0 | 
 |   punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times | 
 |   pshufd            m1, m0, 0x0             ; l1 repeated 16 times | 
 |   pshufd            m2, m0, 0x55            ; l2 repeated 16 times | 
 |   mova    [dstq          ], m1 | 
 |   mova    [dstq+strideq  ], m2 | 
 |   pshufd            m1, m0, 0xaa | 
 |   pshufd            m2, m0, 0xff | 
 |   mova    [dstq+strideq*2], m1 | 
 |   mova    [dstq+stride3q ], m2 | 
 |   inc                lineq | 
 |   lea                leftq, [leftq+4       ] | 
 |   lea                 dstq, [dstq+strideq*4] | 
 |   jnz .loop | 
 |   REP_RET | 
 |  | 
 | INIT_XMM sse2 | 
 | cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left | 
 |   movifnidn              leftq, leftmp | 
 |   mov                    lineq, -8 | 
 |   DEFINE_ARGS dst, stride, line, left, stride3 | 
 |   lea                 stride3q, [strideq*3] | 
 | .loop: | 
 |   movd                      m0, [leftq] | 
 |   punpcklbw                 m0, m0 | 
 |   punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times | 
 |   pshufd                m1, m0, 0x0             ; l1 repeated 16 times | 
 |   pshufd                m2, m0, 0x55            ; l2 repeated 16 times | 
 |   mova     [dstq             ], m1 | 
 |   mova     [dstq+16          ], m1 | 
 |   mova     [dstq+strideq     ], m2 | 
 |   mova     [dstq+strideq+16  ], m2 | 
 |   pshufd                m1, m0, 0xaa | 
 |   pshufd                m2, m0, 0xff | 
 |   mova     [dstq+strideq*2   ], m1 | 
 |   mova     [dstq+strideq*2+16], m1 | 
 |   mova     [dstq+stride3q    ], m2 | 
 |   mova     [dstq+stride3q+16 ], m2 | 
 |   inc                    lineq | 
 |   lea                    leftq, [leftq+4       ] | 
 |   lea                     dstq, [dstq+strideq*4] | 
 |   jnz .loop | 
 |   REP_RET |