;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;
| |
| %include "third_party/x86inc/x86inc.asm" |
| |
SECTION_RODATA
; Every table below is exactly 16 bytes, so each one can be used directly as
; an XMM memory operand.
pb_1:       times 16 db 1       ; 0x01 in every byte (LSB mask for the 3-tap macro)
; Rounding terms for DC over both edges (2*N pixels): added before >> log2(2*N).
pw_4:       times  8 dw 4       ;  4x4  : (sum +  4) >> 3
pw_8:       times  8 dw 8       ;  8x8  : (sum +  8) >> 4
pw_16:      times  8 dw 16      ; 16x16 : (sum + 16) >> 5
pw_32:      times  8 dw 32      ; 32x32 : (sum + 32) >> 6
dc_128:     times 16 db 128     ; constant fill for the dc_128 predictors
; Rounding terms for DC over a single edge (N pixels): added before >> log2(N).
pw2_4:      times  8 dw 2       ;  4x4  : (sum +  2) >> 2
pw2_8:      times  8 dw 4       ;  8x8  : (sum +  4) >> 3
pw2_16:     times  8 dw 8       ; 16x16 : (sum +  8) >> 4
pw2_32:     times  8 dw 16      ; 32x32 : (sum + 16) >> 5
| |
| SECTION .text |
| |
; ------------------------------------------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per-byte 3-tap smoothing: result = (x + 2*y + z + 2) >> 2, computed without
; widening to words (trick from pascal):
;   result  = avg(x, z)           ; pavgb rounds up: (x + z + 1) >> 1
;   result -= (x ^ z) & 1         ; undo that round-up -> (x + z) >> 1
;   result  = avg(result, y)      ; ((x + z) >> 1 + y + 1) >> 1
;
; In:      %1 = x, %2 = y, %3 = z
; Out:     %4 = result
; Clobber: %3 (left holding (x ^ z) & 1)
; Needs GLOBAL() to be usable at the expansion site (pb_1 is read from rodata).
; ------------------------------------------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
    pavgb       %4, %1, %3
    pxor        %3, %1
    pand        %3, [GLOBAL(pb_1)]
    psubb       %4, %3
    pavgb       %4, %2
%endmacro
| |
; dc_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block at dst with
;   dc = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; zero operand for psadbw
    movd        m0, [aboveq]                ; above[0..3]
    movd        m2, [leftq]                 ; left[0..3]
    punpckldq   m0, m2                      ; low qword = above[0..3] : left[0..3]
    psadbw      m0, m1                      ; word 0 = sum of the 8 edge pixels
    paddw       m0, [GLOBAL(pw_4)]          ; + rounding
    psraw       m0, 3                       ; / 8
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    packuswb    m0, m0                      ; dc in bytes 0..3
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    movd        [dstq+strideq*2], m0        ; row 2
    add         dstq, strideq
    movd        [dstq+strideq*2], m0        ; row 3 (dst + 3*stride)

    RESTORE_GOT
    RET
| |
; dc_left_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block at dst with dc = (sum(left[0..3]) + 2) >> 2.
; above is not read.
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
    movifnidn   leftq, leftmp               ; only dst/stride are auto-loaded; fetch left explicitly
    GET_GOT     goffsetq

    movd        m0, [leftq]                 ; left[0..3]
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; word 0 = sum of the 4 pixels
    paddw       m0, [GLOBAL(pw2_4)]         ; + rounding
    psraw       m0, 2                       ; / 4
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    packuswb    m0, m0                      ; dc in bytes 0..3
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    movd        [dstq+strideq*2], m0        ; row 2
    add         dstq, strideq
    movd        [dstq+strideq*2], m0        ; row 3 (dst + 3*stride)

    RESTORE_GOT
    RET
| |
; dc_top_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block at dst with dc = (sum(above[0..3]) + 2) >> 2.
; left is not read.
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movd        m0, [aboveq]                ; above[0..3]
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; word 0 = sum of the 4 pixels
    paddw       m0, [GLOBAL(pw2_4)]         ; + rounding
    psraw       m0, 2                       ; / 4
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    packuswb    m0, m0                      ; dc in bytes 0..3
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    movd        [dstq+strideq*2], m0        ; row 2
    add         dstq, strideq
    movd        [dstq+strideq*2], m0        ; row 3 (dst + 3*stride)

    RESTORE_GOT
    RET
| |
; dc_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block at dst with
;   dc = (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4
INIT_XMM sse2
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]                ; above[0..7]
    movq        m2, [leftq]                 ; left[0..7]
    DEFINE_ARGS dst, stride, stride3        ; above/left registers are free from here on
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; word 0 = sum(above)
    psadbw      m2, m1                      ; word 0 = sum(left)
    paddw       m0, m2
    paddw       m0, [GLOBAL(pw_8)]          ; + rounding
    psraw       m0, 4                       ; / 16
    punpcklbw   m0, m0                      ; word 0 = dc in both bytes
    pshuflw     m0, m0, 0x0                 ; dc in bytes 0..7
    lea         stride3q, [strideq*3]

    ; rows 0..3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4..7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
| |
; dc_top_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block at dst with dc = (sum(above[0..7]) + 4) >> 3.
; left is not read.
INIT_XMM sse2
cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]                ; above[0..7]
    DEFINE_ARGS dst, stride, stride3        ; above register is free from here on
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; word 0 = sum(above)
    paddw       m0, [GLOBAL(pw2_8)]         ; + rounding
    psraw       m0, 3                       ; / 8
    punpcklbw   m0, m0                      ; word 0 = dc in both bytes
    pshuflw     m0, m0, 0x0                 ; dc in bytes 0..7
    lea         stride3q, [strideq*3]

    ; rows 0..3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4..7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
| |
; dc_left_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block at dst with dc = (sum(left[0..7]) + 4) >> 3.
; above is not read.
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
    movifnidn   leftq, leftmp               ; only dst/stride are auto-loaded; fetch left explicitly
    GET_GOT     goffsetq

    movq        m0, [leftq]                 ; left[0..7]
    DEFINE_ARGS dst, stride, stride3        ; left register is free from here on
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; word 0 = sum(left)
    paddw       m0, [GLOBAL(pw2_8)]         ; + rounding
    psraw       m0, 3                       ; / 8
    punpcklbw   m0, m0                      ; word 0 = dc in both bytes
    pshuflw     m0, m0, 0x0                 ; dc in bytes 0..7
    lea         stride3q, [strideq*3]

    ; rows 0..3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4..7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET
| |
; dc_128_predictor_4x4(dst, stride, above, left)
; Fills the 4x4 block at dst with the constant 128. above/left are not read.
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3
    movd        m0, [GLOBAL(dc_128)]        ; four bytes of 128
    lea         stride3q, [strideq*3]
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    movd        [dstq+strideq*2], m0        ; row 2
    movd        [dstq+stride3q], m0         ; row 3
    RESTORE_GOT
    RET
| |
; dc_128_predictor_8x8(dst, stride, above, left)
; Fills the 8x8 block at dst with the constant 128. above/left are not read.
INIT_XMM sse2
cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3
    movq        m0, [GLOBAL(dc_128)]        ; eight bytes of 128
    lea         stride3q, [strideq*3]
    ; rows 0..3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4..7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    RESTORE_GOT
    RET
| |
; dc_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block at dst with
;   dc = (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5
; above, left and every dst row are accessed with aligned 16-byte moves.
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [leftq]                 ; left[0..15]
    DEFINE_ARGS dst, stride, stride3, lines4
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; word 0 = sum of all 32 pixels
    paddw       m0, [GLOBAL(pw_16)]         ; + rounding
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    punpcklqdq  m0, m0                      ; dc in words 0..7
    packuswb    m0, m0                      ; dc in bytes 0..15
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
| |
| |
; dc_top_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block at dst with dc = (sum(above[0..15]) + 8) >> 4.
; left is never read, so only the first three arguments are loaded (matching
; the 4x4 and 8x8 dc_top variants); its register is reused as the loop counter.
; above and every dst row are accessed with aligned 16-byte moves.
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 3, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; zero operand for psadbw
    mova        m0, [aboveq]                ; above[0..15]
    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
    psadbw      m0, m1                      ; one partial sum per qword
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; word 0 = sum of all 16 pixels
    paddw       m0, [GLOBAL(pw2_16)]        ; + rounding
    psraw       m0, 4                       ; / 16
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    punpcklqdq  m0, m0                      ; dc in words 0..7
    packuswb    m0, m0                      ; dc in bytes 0..15
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
| |
; dc_left_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block at dst with dc = (sum(left[0..15]) + 8) >> 4.
; above is not read. left and every dst row are accessed with aligned
; 16-byte moves.
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [leftq]                 ; left[0..15]
    DEFINE_ARGS dst, stride, stride3, lines4
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; one partial sum per qword
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; word 0 = sum of all 16 pixels
    paddw       m0, [GLOBAL(pw2_16)]        ; + rounding
    psraw       m0, 4                       ; / 16
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    punpcklqdq  m0, m0                      ; dc in words 0..7
    packuswb    m0, m0                      ; dc in bytes 0..15
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
| |
; dc_128_predictor_16x16(dst, stride, above, left)
; Fills the 16x16 block at dst with the constant 128.
; above/left are never read, so only dst and stride are loaded and a single
; XMM register is declared (same shape as the 4x4 and 8x8 dc_128 variants).
; Every dst row is written with an aligned 16-byte store.
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 4                  ; 4 iterations x 4 rows
    mova        m0, [GLOBAL(dc_128)]        ; sixteen bytes of 128
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop
    RESTORE_GOT
    REP_RET                                 ; return follows a branch, as in the other looped predictors
| |
| |
; dc_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block at dst with
;   dc = (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6
; above, left and every dst row are accessed with aligned 16-byte moves.
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    mova        m3, [leftq]                 ; left[0..15]
    mova        m4, [leftq+16]              ; left[16..31]
    DEFINE_ARGS dst, stride, stride3, lines4
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; one partial sum per qword
    psadbw      m2, m1
    psadbw      m3, m1
    psadbw      m4, m1
    paddw       m0, m2
    paddw       m3, m4
    paddw       m0, m3
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; word 0 = sum of all 64 pixels
    paddw       m0, [GLOBAL(pw_32)]         ; + rounding
    psraw       m0, 6                       ; / 64
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    punpcklqdq  m0, m0                      ; dc in words 0..7
    packuswb    m0, m0                      ; dc in bytes 0..15
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
| |
; dc_top_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block at dst with dc = (sum(above[0..31]) + 16) >> 5.
; left is never read, so only the first three arguments are loaded, and only
; m0..m2 are used, so three XMM registers are declared.
; above and every dst row are accessed with aligned 16-byte moves.
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 3, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; zero operand for psadbw
    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
    psadbw      m0, m1                      ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; word 0 = sum of all 32 pixels
    paddw       m0, [GLOBAL(pw2_32)]        ; + rounding
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    punpcklqdq  m0, m0                      ; dc in words 0..7
    packuswb    m0, m0                      ; dc in bytes 0..15
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
| |
; dc_left_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block at dst with dc = (sum(left[0..31]) + 16) >> 5.
; above is not read. left and every dst row are accessed with aligned
; 16-byte moves.
; NOTE(review): five XMM registers are declared but only m0..m2 are touched.
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [leftq]                 ; left[0..15]
    mova        m2, [leftq+16]              ; left[16..31]
    DEFINE_ARGS dst, stride, stride3, lines4
    pxor        m1, m1                      ; zero operand for psadbw
    psadbw      m0, m1                      ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-qword sum down
    paddw       m0, m2                      ; word 0 = sum of all 32 pixels
    paddw       m0, [GLOBAL(pw2_32)]        ; + rounding
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; dc in words 0..3
    punpcklqdq  m0, m0                      ; dc in words 0..7
    packuswb    m0, m0                      ; dc in bytes 0..15
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET
| |
; dc_128_predictor_32x32(dst, stride, above, left)
; Fills the 32x32 block at dst with the constant 128.
; above/left are never read, so only dst and stride are loaded and a single
; XMM register is declared (same shape as the 4x4 and 8x8 dc_128 variants).
; Every dst row is written with two aligned 16-byte stores.
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 8                  ; 8 iterations x 4 rows
    mova        m0, [GLOBAL(dc_128)]        ; sixteen bytes of 128
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         lines4d
    jnz         .loop
    RESTORE_GOT
    REP_RET                                 ; return follows a branch, as in the other looped predictors
| |
; v_predictor_4x4(dst, stride, above)
; Copies above[0..3] into each of the 4 rows of the block at dst.
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd        m0, [aboveq]                ; above[0..3]
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    movd        [dstq+strideq*2], m0        ; row 2
    add         dstq, strideq
    movd        [dstq+strideq*2], m0        ; row 3 (dst + 3*stride)
    RET
| |
; v_predictor_8x8(dst, stride, above)
; Copies above[0..7] into each of the 8 rows of the block at dst.
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]                ; above[0..7]
    DEFINE_ARGS dst, stride, stride3        ; above register becomes stride3
    lea         stride3q, [strideq*3]
    ; rows 0..3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4..7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    RET
| |
; v_predictor_16x16(dst, stride, above)
; Copies above[0..15] into each of the 16 rows of the block at dst.
; above and every dst row are accessed with aligned 16-byte moves.
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova        m0, [aboveq]                ; above[0..15]
    DEFINE_ARGS dst, stride, stride3, nlines4
    mov         nlines4d, 4                 ; 4 iterations x 4 rows
    lea         stride3q, [strideq*3]
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         nlines4d
    jnz         .loop
    REP_RET
| |
; v_predictor_32x32(dst, stride, above)
; Copies above[0..31] into each of the 32 rows of the block at dst.
; above and every dst row are accessed with aligned 16-byte moves.
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; above[0..15]
    mova        m1, [aboveq+16]             ; above[16..31]
    DEFINE_ARGS dst, stride, stride3, nlines4
    mov         nlines4d, 8                 ; 8 iterations x 4 rows
    lea         stride3q, [strideq*3]
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m1
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    dec         nlines4d
    jnz         .loop
    REP_RET
| |
; h_predictor_4x4(dst, stride, <unused>, left)
; Row r of the 4x4 block at dst is filled with left[r].
; The third argument slot is named "line" but is never touched here.
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only dst/stride are auto-loaded; fetch left explicitly
    movd        m0, [leftq]                 ; l1 l2 l3 l4 [byte]
    punpcklbw   m0, m0                      ; each pixel twice
    punpcklbw   m0, m0                      ; dword r = left[r] x4
    pshufd      m1, m0, 0x1                 ; dword 0 = l2 x4
    pshufd      m2, m0, 0x2                 ; dword 0 = l3 x4
    pshufd      m3, m0, 0x3                 ; dword 0 = l4 x4
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m1          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m2                  ; row 2
    movd        [dstq+strideq], m3          ; row 3
    RET
| |
; h_predictor_8x8(dst, stride, <unused>, left)
; Row r of the 8x8 block at dst is filled with left[r].
; The third argument slot is reused as the loop counter "line".
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only dst/stride are auto-loaded; fetch left explicitly
    DEFINE_ARGS dst, stride, line, left, stride3
    movq        m0, [leftq]                 ; l1 .. l8 [byte]
    punpcklbw   m0, m0                      ; l1 l1 l2 l2 ... l8 l8
    lea         stride3q, [strideq*3]
    mov         lineq, -2                   ; counts up to 0: 2 iterations x 4 rows
.loop:
    pshuflw     m1, m0, 0x0                 ; low qword = first pixel of this group x8
    pshuflw     m2, m0, 0x55                ; low qword = second pixel x8
    movq        [dstq], m1
    movq        [dstq+strideq], m2
    pshuflw     m1, m0, 0xaa                ; third pixel x8
    pshuflw     m2, m0, 0xff                ; fourth pixel x8
    movq        [dstq+strideq*2], m1
    movq        [dstq+stride3q], m2
    pshufd      m0, m0, 0xe                 ; low qword <- high qword (l5 l5 ... l8 l8)
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    inc         lineq
    jnz         .loop
    REP_RET
| |
; h_predictor_16x16(dst, stride, <unused>, left)
; Row r of the 16x16 block at dst is filled with left[r].
; The third argument slot is reused as the loop counter "line".
; Every dst row is written with an aligned 16-byte store.
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only dst/stride are auto-loaded; fetch left explicitly
    DEFINE_ARGS dst, stride, line, left, stride3
    lea         stride3q, [strideq*3]
    mov         lineq, -4                   ; counts up to 0: 4 iterations x 4 rows
.loop:
    movd        m0, [leftq]                 ; next four left pixels
    add         leftq, 4
    punpcklbw   m0, m0
    punpcklbw   m0, m0                      ; dword k = k-th pixel x4
    pshufd      m1, m0, 0x0                 ; first pixel x16
    pshufd      m2, m0, 0x55                ; second pixel x16
    mova        [dstq], m1
    mova        [dstq+strideq], m2
    pshufd      m1, m0, 0xaa                ; third pixel x16
    pshufd      m2, m0, 0xff                ; fourth pixel x16
    mova        [dstq+strideq*2], m1
    mova        [dstq+stride3q], m2
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    inc         lineq                       ; last flag-setting op before the branch
    jnz         .loop
    REP_RET
| |
; h_predictor_32x32(dst, stride, <unused>, left)
; Row r of the 32x32 block at dst is filled with left[r].
; The third argument slot is reused as the loop counter "line".
; Every dst row is written with two aligned 16-byte stores.
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp               ; only dst/stride are auto-loaded; fetch left explicitly
    DEFINE_ARGS dst, stride, line, left, stride3
    lea         stride3q, [strideq*3]
    mov         lineq, -8                   ; counts up to 0: 8 iterations x 4 rows
.loop:
    movd        m0, [leftq]                 ; next four left pixels
    add         leftq, 4
    punpcklbw   m0, m0
    punpcklbw   m0, m0                      ; dword k = k-th pixel x4
    pshufd      m1, m0, 0x0                 ; first pixel x16
    pshufd      m2, m0, 0x55                ; second pixel x16
    mova        [dstq], m1
    mova        [dstq+16], m1
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m2
    pshufd      m1, m0, 0xaa                ; third pixel x16
    pshufd      m2, m0, 0xff                ; fourth pixel x16
    mova        [dstq+strideq*2], m1
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q], m2
    mova        [dstq+stride3q+16], m2
    lea         dstq, [dstq+strideq*4]      ; lea leaves flags alone
    inc         lineq                       ; last flag-setting op before the branch
    jnz         .loop
    REP_RET
| |
; tm_predictor_4x4(dst, stride, above, left)
; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for the 4x4 block.
; The clip comes from packuswb's unsigned saturation.
; The top row is fetched with one 8-byte load starting at above[-1], so bytes
; above[4..6] are read as well but never used.
INIT_XMM sse2
cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    movq        m0, [aboveq-1]              ; [63:0] tl t1 t2 t3 t4 x x x
    punpcklbw   m0, m1                      ; same, as words
    pshuflw     m3, m0, 0x0                 ; [63:0] tl tl tl tl [word]
    psrldq      m0, 2                       ; drop tl: t1 t2 t3 t4 ...
    psubw       m0, m3                      ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
    movd        m2, [leftq]
    punpcklbw   m2, m1                      ; [63:0] l1 l2 l3 l4 [word]

    ; rows 0 and 1
    pshuflw     m4, m2, 0x0                 ; l1 x4
    paddw       m4, m0
    packuswb    m4, m4
    movd        [dstq], m4
    pshuflw     m3, m2, 0x55                ; l2 x4
    paddw       m3, m0
    packuswb    m3, m3
    movd        [dstq+strideq], m3
    lea         dstq, [dstq+strideq*2]

    ; rows 2 and 3
    pshuflw     m4, m2, 0xaa                ; l3 x4
    paddw       m4, m0
    packuswb    m4, m4
    movd        [dstq], m4
    pshuflw     m3, m2, 0xff                ; l4 x4
    paddw       m3, m0
    packuswb    m3, m3
    movd        [dstq+strideq], m3
    RET
| |
; tm_predictor_8x8(dst, stride, above, left)
; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for the 8x8 block.
; Two rows are produced per loop iteration; packuswb supplies the clip.
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    movq        m0, [aboveq]
    movd        m2, [aboveq-1]              ; byte 0 = tl
    punpcklbw   m0, m1                      ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
    punpcklbw   m2, m1
    pshuflw     m2, m2, 0x0                 ; [63:0] tl tl tl tl [word]
    punpcklqdq  m2, m2                      ; tl x8 [word]
    psubw       m0, m2                      ; t1-tl t2-tl ... t8-tl [word]
    DEFINE_ARGS dst, stride, line, left     ; above register becomes the row-pair counter
    movq        m2, [leftq]
    punpcklbw   m2, m1                      ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
    mov         lineq, -4                   ; counts up to 0: 4 iterations x 2 rows
.loop:
    pshuflw     m4, m2, 0x0                 ; [63:0] first left pixel x4 [word]
    pshuflw     m3, m2, 0x55                ; [63:0] second left pixel x4 [word]
    psrldq      m2, 4                       ; both consumed: shift in the next pair
    punpcklqdq  m4, m4                      ; x8
    punpcklqdq  m3, m3                      ; x8
    paddw       m4, m0
    paddw       m3, m0
    packuswb    m4, m3                      ; low qword = first row, high qword = second row
    movq        [dstq], m4
    movhps      [dstq+strideq], m4
    lea         dstq, [dstq+strideq*2]      ; lea leaves flags alone
    inc         lineq
    jnz         .loop
    REP_RET
| |
; tm_predictor_16x16(dst, stride, above, left)
; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for the 16x16 block.
; Each loop iteration emits row r (from l[r]) and row r+8 (from l[r+8]);
; packuswb supplies the clip.
; The top-left pixel is taken from an aligned 16-byte load at above-16, so the
; 16 bytes preceding above are read. left and dst rows use aligned moves too.
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    mova        m0, [aboveq]                ; t1 t2 ... t16 [byte]
    mova        m2, [aboveq-16]             ; byte 15 = tl
    punpckhbw   m4, m0, m1
    punpcklbw   m0, m1                      ; m0:m4 = t1 t2 ... t16 [word]
    punpckhbw   m2, m1                      ; [127:112] tl [word]
    pshufhw     m2, m2, 0xff                ; tl in words 4..7
    punpckhqdq  m2, m2                      ; tl repeated 8 times [word]
    psubw       m0, m2
    psubw       m4, m2                      ; m0:m4 = t1-tl t2-tl ... t16-tl [word]
    DEFINE_ARGS dst, stride, line, left, stride8
    mova        m3, [leftq]                 ; l1 l2 ... l16 [byte]
    punpckhbw   m5, m3, m1
    punpcklbw   m3, m1                      ; m3 = l1..l8, m5 = l9..l16 [word]
    lea         stride8q, [strideq*8]
    mov         lineq, -8                   ; counts up to 0: 8 iterations x 2 rows
.loop:
    pshuflw     m6, m3, 0x0
    pshuflw     m7, m5, 0x0
    punpcklqdq  m6, m6                      ; l[r]   repeated 8 times [word]
    punpcklqdq  m7, m7                      ; l[r+8] repeated 8 times [word]
    paddw       m1, m6, m0                  ; m1 is no longer needed as zero here
    paddw       m6, m4                      ; m1:m6 = ti-tl+l[r]   [i=1..16] [word]
    packuswb    m1, m6
    mova        [dstq], m1
    paddw       m1, m7, m0
    paddw       m7, m4                      ; m1:m7 = ti-tl+l[r+8] [i=1..16] [word]
    packuswb    m1, m7
    mova        [dstq+stride8q], m1
    psrldq      m3, 2                       ; advance to the next left pixel in each half
    psrldq      m5, 2
    add         dstq, strideq
    inc         lineq                       ; last flag-setting op before the branch
    jnz         .loop
    REP_RET
| |
; tm_predictor_32x32(dst, stride, above, left)
; dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for the 32x32 block.
; Two rows are produced per loop iteration; packuswb supplies the clip.
; above and every dst row are accessed with aligned 16-byte moves.
;
; The two left pixels for an iteration are fetched with a 16-bit pinsrw load.
; A 32-bit movd here would read left[32..33] on the final iteration, two bytes
; beyond the 32 pixels this predictor consumes.
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
    pxor        m1, m1                      ; zero for byte -> word unpacking
    movd        m2, [aboveq-1]              ; byte 0 = tl
    mova        m0, [aboveq]                ; t1  .. t16 [byte]
    mova        m4, [aboveq+16]             ; t17 .. t32 [byte]
    punpcklbw   m2, m1
    punpckhbw   m3, m0, m1
    punpckhbw   m5, m4, m1
    punpcklbw   m0, m1
    punpcklbw   m4, m1                      ; m0:m3:m4:m5 = t1 .. t32 [word]
    pshuflw     m2, m2, 0x0
    DEFINE_ARGS dst, stride, line, left     ; above register becomes the row-pair counter
    mov         lineq, -16                  ; counts up to 0: 16 iterations x 2 rows
    punpcklqdq  m2, m2                      ; tl repeated 8 times [word]
    add         leftq, 32                   ; so [leftq+lineq*2] walks left[0], left[2], ...
    psubw       m0, m2
    psubw       m3, m2
    psubw       m4, m2
    psubw       m5, m2                      ; m0:m3:m4:m5 = ti-tl [word]
.loop:
    pinsrw      m2, [leftq+lineq*2], 0      ; bytes 0..1 = the two left pixels; rest is stale and unused
    pxor        m1, m1                      ; m1 is clobbered below, so re-zero each pass
    punpcklbw   m2, m1                      ; word 0 = first pixel, word 1 = second pixel
    pshuflw     m7, m2, 0x55
    pshuflw     m2, m2, 0x0
    punpcklqdq  m2, m2                      ; first pixel x8  [word]
    punpcklqdq  m7, m7                      ; second pixel x8 [word]
    ; first row, columns 0..15
    paddw       m6, m2, m3
    paddw       m1, m2, m0
    packuswb    m1, m6
    mova        [dstq], m1
    ; first row, columns 16..31
    paddw       m6, m2, m5
    paddw       m1, m2, m4
    packuswb    m1, m6
    mova        [dstq+16], m1
    ; second row, columns 0..15
    paddw       m6, m7, m3
    paddw       m1, m7, m0
    packuswb    m1, m6
    mova        [dstq+strideq], m1
    ; second row, columns 16..31
    paddw       m6, m7, m5
    paddw       m1, m7, m4
    packuswb    m1, m6
    mova        [dstq+strideq+16], m1
    lea         dstq, [dstq+strideq*2]      ; lea leaves flags alone
    inc         lineq
    jnz         .loop
    REP_RET