;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Per-byte 1: low-bit mask used by X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2.
pb_1:       times 16 db 1

; Rounding terms for the DC predictors that average both edges of an NxN
; block (2*N pixels): pw_N holds N, i.e. half the number of pixels summed.
pw_4:       times 8  dw 4
pw_8:       times 8  dw 8
pw_16:      times 8  dw 16
pw_32:      times 8  dw 32

; Constant fill value used by the dc_128 predictors.
dc_128:     times 16 db 128

; Rounding terms for the DC predictors that average a single edge
; (N pixels): pw2_N holds N/2.
pw2_4:      times 8  dw 2
pw2_8:      times 8  dw 4
pw2_16:     times 8  dw 8
pw2_32:     times 8  dw 16

SECTION .text

; ---------------------------------------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per-byte 3-tap filter: result = (x + 2*y + z + 2) >> 2, computed with byte
; averages only (trick from pascal):
;     result  = avg(x, z)         ; pavgb rounds up: (x + z + 1) >> 1
;     result -= xor(x, z) & 1     ; remove the round-up: (x + z) >> 1
;     result  = avg(result, y)    ; (((x + z) >> 1) + y + 1) >> 1
;
; Operands: %1 = x, %2 = y, %3 = z, %4 = result.
; Side effects: %3 (z) is overwritten with (x ^ z) & 1.
; %4 is written before %1 and %3 are read again, so it must not be the same
; register as either of them.
; ---------------------------------------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
    pavgb       %4, %1, %3              ; result = avg(x, z), rounded up
    pxor        %3, %1                  ; z ^= x
    pand        %3, [GLOBAL(pb_1)]      ; keep only the low bit of each byte
    psubb       %4, %3                  ; drop the rounding where x+z was odd
    pavgb       %4, %2                  ; result = avg(result, y)
%endmacro

; ---------------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with the rounded mean of the 4 bytes at [above] and the
; 4 bytes at [left]: dc = (sum + 4) >> 3.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                  ; all-zero operand for psadbw
    movd        m0, [aboveq]            ; 4 pixels above the block
    movd        m2, [leftq]             ; 4 pixels left of the block
    punpckldq   m0, m2                  ; all 8 edge pixels in the low qword
    psadbw      m0, m1                  ; low word = sum of the 8 pixels
    paddw       m0, [GLOBAL(pw_4)]
    psraw       m0, 3                   ; dc = (sum + 4) >> 3
    pshuflw     m0, m0, 0x0             ; dc in each of the 4 low words
    packuswb    m0, m0                  ; narrow to bytes: dc x4 in low dword

    ; rows 0-1
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq], m0
    movd        [dstq+strideq], m0

    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_left_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with the rounded mean of the 4 bytes at [left]:
; dc = (sum + 2) >> 2.  `above` is not read.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
    ; Only dst/stride are auto-loaded (arg count 2), so fetch `left` here.
    ; NOTE(review): kept ahead of GET_GOT as in the original ordering;
    ; GET_GOT may adjust the stack on 32-bit PIC builds - confirm before
    ; moving this load.
    movifnidn   leftq, leftmp
    GET_GOT     goffsetq

    movd        m0, [leftq]             ; 4 left pixels, rest of m0 zeroed
    pxor        m1, m1                  ; all-zero operand for psadbw
    psadbw      m0, m1                  ; low word = sum of the 4 pixels
    paddw       m0, [GLOBAL(pw2_4)]
    psraw       m0, 2                   ; dc = (sum + 2) >> 2
    pshuflw     m0, m0, 0x0             ; dc in each of the 4 low words
    packuswb    m0, m0                  ; narrow to bytes: dc x4 in low dword

    ; rows 0-1
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq], m0
    movd        [dstq+strideq], m0

    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_top_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with the rounded mean of the 4 bytes at [above]:
; dc = (sum + 2) >> 2.  `left` is not read.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movd        m0, [aboveq]            ; 4 above pixels, rest of m0 zeroed
    pxor        m1, m1                  ; all-zero operand for psadbw
    psadbw      m0, m1                  ; low word = sum of the 4 pixels
    paddw       m0, [GLOBAL(pw2_4)]
    psraw       m0, 2                   ; dc = (sum + 2) >> 2
    pshuflw     m0, m0, 0x0             ; dc in each of the 4 low words
    packuswb    m0, m0                  ; narrow to bytes: dc x4 in low dword

    ; rows 0-1
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq], m0
    movd        [dstq+strideq], m0

    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with the rounded mean of the 8 bytes at [above] and the
; 8 bytes at [left]: dc = (sum + 8) >> 4.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]            ; 8 pixels above the block
    movq        m2, [leftq]             ; 8 pixels left of the block
    pxor        m1, m1                  ; all-zero operand for psadbw
    ; above/left are consumed; their register slots are renamed from here on.
    DEFINE_ARGS dst, stride, stride_x3
    psadbw      m0, m1                  ; low word = sum(above)
    psadbw      m2, m1                  ; low word = sum(left)
    paddw       m0, m2
    paddw       m0, [GLOBAL(pw_8)]
    psraw       m0, 4                   ; dc = (sum + 8) >> 4
    punpcklbw   m0, m0                  ; low word = dc:dc (two bytes)
    pshuflw     m0, m0, 0x0             ; low qword = dc x8
    lea         stride_x3q, [strideq*3]

    ; rows 0-3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0

    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_top_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with the rounded mean of the 8 bytes at [above]:
; dc = (sum + 4) >> 3.  `left` is not read.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]            ; 8 above pixels, high qword zeroed
    pxor        m1, m1                  ; all-zero operand for psadbw
    ; above is consumed; its register slot becomes stride*3.
    DEFINE_ARGS dst, stride, stride_x3
    psadbw      m0, m1                  ; low word = sum of the 8 pixels
    paddw       m0, [GLOBAL(pw2_8)]
    psraw       m0, 3                   ; dc = (sum + 4) >> 3
    punpcklbw   m0, m0                  ; low word = dc:dc (two bytes)
    pshuflw     m0, m0, 0x0             ; low qword = dc x8
    lea         stride_x3q, [strideq*3]

    ; rows 0-3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0

    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_left_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with the rounded mean of the 8 bytes at [left]:
; dc = (sum + 4) >> 3.  `above` is not read.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
    ; Only dst/stride are auto-loaded (arg count 2), so fetch `left` here.
    ; NOTE(review): kept ahead of GET_GOT as in the original ordering;
    ; GET_GOT may adjust the stack on 32-bit PIC builds - confirm before
    ; moving this load.
    movifnidn   leftq, leftmp
    GET_GOT     goffsetq

    movq        m0, [leftq]             ; 8 left pixels, high qword zeroed
    pxor        m1, m1                  ; all-zero operand for psadbw
    ; left is consumed; the third register slot becomes stride*3.
    DEFINE_ARGS dst, stride, stride_x3
    psadbw      m0, m1                  ; low word = sum of the 8 pixels
    paddw       m0, [GLOBAL(pw2_8)]
    psraw       m0, 3                   ; dc = (sum + 4) >> 3
    punpcklbw   m0, m0                  ; low word = dc:dc (two bytes)
    pshuflw     m0, m0, 0x0             ; low qword = dc x8
    lea         stride_x3q, [strideq*3]

    ; rows 0-3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0

    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_128_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with the constant byte 128.  above/left are not read.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride_x3
    movd        m0, [GLOBAL(dc_128)]    ; low dword = 128 x4
    lea         stride_x3q, [strideq*3]
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    movd        [dstq+strideq*2], m0
    movd        [dstq+stride_x3q], m0
    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_128_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with the constant byte 128.  above/left are not read.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride_x3
    movq        m0, [GLOBAL(dc_128)]    ; low qword = 128 x8
    lea         stride_x3q, [strideq*3]
    ; rows 0-3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    RESTORE_GOT
    RET

; ---------------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with the rounded mean of the 16 bytes at [above] and
; the 16 bytes at [left]: dc = (sum + 16) >> 5.
; All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]            ; 16 pixels above the block
    mova        m2, [leftq]             ; 16 pixels left of the block
    pxor        m1, m1                  ; all-zero operand for psadbw
    ; above/left are consumed; their register slots are renamed from here on.
    DEFINE_ARGS dst, stride, stride_x3, cnt
    psadbw      m0, m1                  ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2                  ; above + left, still per qword
    movhlps     m2, m0                  ; bring the high-qword sum down
    paddw       m0, m2                  ; low word = sum of all 32 pixels
    paddw       m0, [GLOBAL(pw_16)]
    psraw       m0, 5                   ; dc = (sum + 16) >> 5
    pshuflw     m0, m0, 0x0             ; dc in the 4 low words
    punpcklqdq  m0, m0                  ; dc in all 8 words
    packuswb    m0, m0                  ; dc in all 16 bytes
    lea         stride_x3q, [strideq*3]
    mov         cntd, 4                 ; 4 iterations x 4 rows = 16 rows
.rows4:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jnz         .rows4

    RESTORE_GOT
    REP_RET


; ---------------------------------------------------------------------------
; dc_top_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with the rounded mean of the 16 bytes at [above]:
; dc = (sum + 8) >> 4.  `left` is not read, so only the first three
; arguments are auto-loaded (same as the 4x4/8x8 dc_top variants).
; All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 3, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                  ; all-zero operand for psadbw
    mova        m0, [aboveq]            ; 16 pixels above the block
    ; above is consumed; register slots 2 and 3 become stride*3 / counter.
    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 4              ; 4 iterations x 4 rows = 16 rows
    psadbw      m0, m1                  ; one partial sum per qword
    movhlps     m2, m0                  ; bring the high-qword sum down
    paddw       m0, m2                  ; low word = sum of the 16 pixels
    paddw       m0, [GLOBAL(pw2_16)]
    psraw       m0, 4                   ; dc = (sum + 8) >> 4
    pshuflw     m0, m0, 0x0             ; dc in the 4 low words
    punpcklqdq  m0, m0                  ; dc in all 8 words
    packuswb    m0, m0                  ; dc in all 16 bytes
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET

; ---------------------------------------------------------------------------
; dc_left_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with the rounded mean of the 16 bytes at [left]:
; dc = (sum + 8) >> 4.  `above` is not read.
; All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [leftq]             ; 16 pixels left of the block
    pxor        m1, m1                  ; all-zero operand for psadbw
    ; left is consumed; register slots 2 and 3 become stride*3 / counter.
    DEFINE_ARGS dst, stride, stride_x3, cnt
    psadbw      m0, m1                  ; one partial sum per qword
    movhlps     m2, m0                  ; bring the high-qword sum down
    paddw       m0, m2                  ; low word = sum of the 16 pixels
    paddw       m0, [GLOBAL(pw2_16)]
    psraw       m0, 4                   ; dc = (sum + 8) >> 4
    pshuflw     m0, m0, 0x0             ; dc in the 4 low words
    punpcklqdq  m0, m0                  ; dc in all 8 words
    packuswb    m0, m0                  ; dc in all 16 bytes
    lea         stride_x3q, [strideq*3]
    mov         cntd, 4                 ; 4 iterations x 4 rows = 16 rows
.rows4:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jnz         .rows4

    RESTORE_GOT
    REP_RET

; ---------------------------------------------------------------------------
; dc_128_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with the constant byte 128.  above/left are not read,
; so only dst/stride are auto-loaded and a single xmm register is declared
; (same as the 4x4/8x8 dc_128 variants).
; Stores use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 4              ; 4 iterations x 4 rows = 16 rows
    mova        m0, [GLOBAL(dc_128)]    ; 128 in all 16 bytes
.loop:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop
    RESTORE_GOT
    ; REP_RET, like the other looped predictors in this file: the return can
    ; directly follow the jnz when RESTORE_GOT expands to nothing.
    REP_RET


; ---------------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with the rounded mean of the 32 bytes at [above] and
; the 32 bytes at [left]: dc = (sum + 32) >> 6.
; All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]            ; above[0..15]
    mova        m2, [aboveq+16]         ; above[16..31]
    mova        m3, [leftq]             ; left[0..15]
    mova        m4, [leftq+16]          ; left[16..31]
    pxor        m1, m1                  ; all-zero operand for psadbw
    ; above/left are consumed; their register slots are renamed from here on.
    DEFINE_ARGS dst, stride, stride_x3, cnt
    psadbw      m0, m1                  ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    psadbw      m3, m1
    paddw       m0, m3
    psadbw      m4, m1
    paddw       m0, m4                  ; per-qword sums of all four loads
    movhlps     m2, m0                  ; bring the high-qword sum down
    paddw       m0, m2                  ; low word = sum of all 64 pixels
    paddw       m0, [GLOBAL(pw_32)]
    psraw       m0, 6                   ; dc = (sum + 32) >> 6
    pshuflw     m0, m0, 0x0             ; dc in the 4 low words
    punpcklqdq  m0, m0                  ; dc in all 8 words
    packuswb    m0, m0                  ; dc in all 16 bytes
    lea         stride_x3q, [strideq*3]
    mov         cntd, 8                 ; 8 iterations x 4 rows = 32 rows
.rows4:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride_x3q], m0
    mova        [dstq+stride_x3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jnz         .rows4

    RESTORE_GOT
    REP_RET

; ---------------------------------------------------------------------------
; dc_top_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with the rounded mean of the 32 bytes at [above]:
; dc = (sum + 16) >> 5.  `left` is not read, so only the first three
; arguments are auto-loaded; only m0-m2 are used, so 3 xmm registers are
; declared.
; All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 3, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                  ; all-zero operand for psadbw
    mova        m0, [aboveq]            ; above[0..15]
    mova        m2, [aboveq+16]         ; above[16..31]
    ; above is consumed; register slots 2 and 3 become stride*3 / counter.
    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 8              ; 8 iterations x 4 rows = 32 rows
    psadbw      m0, m1                  ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                  ; bring the high-qword sum down
    paddw       m0, m2                  ; low word = sum of the 32 pixels
    paddw       m0, [GLOBAL(pw2_32)]
    psraw       m0, 5                   ; dc = (sum + 16) >> 5
    pshuflw     m0, m0, 0x0             ; dc in the 4 low words
    punpcklqdq  m0, m0                  ; dc in all 8 words
    packuswb    m0, m0                  ; dc in all 16 bytes
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET

; ---------------------------------------------------------------------------
; dc_left_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with the rounded mean of the 32 bytes at [left]:
; dc = (sum + 16) >> 5.  `above` is not read.  Only m0-m2 are used, so
; 3 xmm registers are declared.
; All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                  ; all-zero operand for psadbw
    mova        m0, [leftq]             ; left[0..15]
    mova        m2, [leftq+16]          ; left[16..31]
    ; left is consumed; register slots 2 and 3 become stride*3 / counter.
    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 8              ; 8 iterations x 4 rows = 32 rows
    psadbw      m0, m1                  ; one partial sum per qword
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                  ; bring the high-qword sum down
    paddw       m0, m2                  ; low word = sum of the 32 pixels
    paddw       m0, [GLOBAL(pw2_32)]
    psraw       m0, 5                   ; dc = (sum + 16) >> 5
    pshuflw     m0, m0, 0x0             ; dc in the 4 low words
    punpcklqdq  m0, m0                  ; dc in all 8 words
    packuswb    m0, m0                  ; dc in all 16 bytes
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop

    RESTORE_GOT
    REP_RET

; ---------------------------------------------------------------------------
; dc_128_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with the constant byte 128.  above/left are not read,
; so only dst/stride are auto-loaded and a single xmm register is declared
; (same as the 4x4/8x8 dc_128 variants).
; Stores use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 2, 5, 1, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    DEFINE_ARGS dst, stride, stride3, lines4
    lea         stride3q, [strideq*3]
    mov         lines4d, 8              ; 8 iterations x 4 rows = 32 rows
    mova        m0, [GLOBAL(dc_128)]    ; 128 in all 16 bytes
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         lines4d
    jnz         .loop
    RESTORE_GOT
    ; REP_RET, like the other looped predictors in this file: the return can
    ; directly follow the jnz when RESTORE_GOT expands to nothing.
    REP_RET

; ---------------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
; Vertical prediction: copies the 4 bytes at [above] into each of the 4 rows
; of dst.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd        m0, [aboveq]            ; the row to replicate
    ; rows 0-1
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq], m0
    movd        [dstq+strideq], m0
    RET

; ---------------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
; Vertical prediction: copies the 8 bytes at [above] into each of the 8 rows
; of dst.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]            ; the row to replicate
    ; above is consumed; its register slot becomes stride*3.
    DEFINE_ARGS dst, stride, stride_x3
    lea         stride_x3q, [strideq*3]
    ; rows 0-3
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq], m0
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride_x3q], m0
    RET

; ---------------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
; Vertical prediction: copies the 16 bytes at [above] into each of the 16
; rows of dst.  All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova        m0, [aboveq]            ; the row to replicate
    ; above is consumed; slot 2 becomes stride*3, slot 3 the loop counter.
    DEFINE_ARGS dst, stride, stride_x3, cnt
    mov         cntd, 4                 ; 4 iterations x 4 rows = 16 rows
    lea         stride_x3q, [strideq*3]
.rows4:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride_x3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jnz         .rows4
    REP_RET

; ---------------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
; Vertical prediction: copies the 32 bytes at [above] into each of the 32
; rows of dst.  All 16-byte accesses use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]            ; left half of the row
    mova        m1, [aboveq+16]         ; right half of the row
    ; above is consumed; slot 2 becomes stride*3, slot 3 the loop counter.
    DEFINE_ARGS dst, stride, stride_x3, cnt
    mov         cntd, 8                 ; 8 iterations x 4 rows = 32 rows
    lea         stride_x3q, [strideq*3]
.rows4:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride_x3q], m0
    mova        [dstq+stride_x3q+16], m1
    lea         dstq, [dstq+strideq*4]
    dec         cntd
    jnz         .rows4
    REP_RET

; ---------------------------------------------------------------------------
; h_predictor_4x4(dst, stride, <unused>, left)
; Horizontal prediction: row i of the 4x4 block is filled with left[i].
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
    movifnidn   leftq, leftmp           ; only dst/stride are auto-loaded
    movd        m0, [leftq]             ; l1 l2 l3 l4
    punpcklbw   m0, m0                  ; l1 l1 l2 l2 l3 l3 l4 l4
    punpcklbw   m0, m0                  ; dword i = left[i] x4
    pshufd      m1, m0, 0x1             ; low dword = l2 x4
    pshufd      m2, m0, 0x2             ; low dword = l3 x4
    pshufd      m3, m0, 0x3             ; low dword = l4 x4
    ; rows 0-1
    movd        [dstq], m0
    movd        [dstq+strideq], m1
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq], m2
    movd        [dstq+strideq], m3
    RET

; ---------------------------------------------------------------------------
; h_predictor_8x8(dst, stride, <unused>, left)
; Horizontal prediction: row i of the 8x8 block is filled with left[i].
; Two loop iterations, each writing 4 rows.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp           ; only dst/stride are auto-loaded
    DEFINE_ARGS dst, stride, cnt, left, stride_x3
    movq        m0, [leftq]             ; l1 .. l8
    punpcklbw   m0, m0                  ; l1 l1 l2 l2 ... l8 l8
    lea         stride_x3q, [strideq*3]
    mov         cntq, -2                ; counts up to zero: 2 iterations
.rows4:
    pshuflw     m1, m0, 0x0             ; low qword = word 0 x4 (l1 x8)
    pshuflw     m2, m0, 0x55            ; low qword = word 1 x4 (l2 x8)
    movq        [dstq], m1
    movq        [dstq+strideq], m2
    pshuflw     m1, m0, 0xaa            ; low qword = word 2 x4
    pshuflw     m2, m0, 0xff            ; low qword = word 3 x4
    movq        [dstq+strideq*2], m1
    movq        [dstq+stride_x3q], m2
    pshufd      m0, m0, 0xe             ; high qword (l5 .. l8 pairs) -> low
    lea         dstq, [dstq+strideq*4]  ; lea leaves the flags untouched
    inc         cntq
    jnz         .rows4
    REP_RET

; ---------------------------------------------------------------------------
; h_predictor_16x16(dst, stride, <unused>, left)
; Horizontal prediction: row i of the 16x16 block is filled with left[i].
; Four loop iterations, each consuming 4 left pixels and writing 4 rows.
; Stores use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp           ; only dst/stride are auto-loaded
    DEFINE_ARGS dst, stride, cnt, left, stride_x3
    lea         stride_x3q, [strideq*3]
    mov         cntq, -4                ; counts up to zero: 4 iterations
.rows4:
    movd        m0, [leftq]             ; next 4 left pixels
    punpcklbw   m0, m0
    punpcklbw   m0, m0                  ; each pixel repeated 4 times
    pshufd      m1, m0, 0x0             ; 1st pixel x16
    pshufd      m2, m0, 0x55            ; 2nd pixel x16
    mova        [dstq], m1
    mova        [dstq+strideq], m2
    pshufd      m1, m0, 0xaa            ; 3rd pixel x16
    pshufd      m2, m0, 0xff            ; 4th pixel x16
    mova        [dstq+strideq*2], m1
    mova        [dstq+stride_x3q], m2
    lea         leftq, [leftq+4]
    lea         dstq, [dstq+strideq*4]  ; lea leaves the flags untouched
    inc         cntq
    jnz         .rows4
    REP_RET

; ---------------------------------------------------------------------------
; h_predictor_32x32(dst, stride, <unused>, left)
; Horizontal prediction: row i of the 32x32 block is filled with left[i].
; Eight loop iterations, each consuming 4 left pixels and writing 4 rows.
; Stores use mova.
; ---------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
    movifnidn   leftq, leftmp           ; only dst/stride are auto-loaded
    DEFINE_ARGS dst, stride, cnt, left, stride_x3
    lea         stride_x3q, [strideq*3]
    mov         cntq, -8                ; counts up to zero: 8 iterations
.rows4:
    movd        m0, [leftq]             ; next 4 left pixels
    punpcklbw   m0, m0
    punpcklbw   m0, m0                  ; each pixel repeated 4 times
    pshufd      m1, m0, 0x0             ; 1st pixel x16
    pshufd      m2, m0, 0x55            ; 2nd pixel x16
    mova        [dstq], m1
    mova        [dstq+16], m1
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m2
    pshufd      m1, m0, 0xaa            ; 3rd pixel x16
    pshufd      m2, m0, 0xff            ; 4th pixel x16
    mova        [dstq+strideq*2], m1
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride_x3q], m2
    mova        [dstq+stride_x3q+16], m2
    lea         leftq, [leftq+4]
    lea         dstq, [dstq+strideq*4]  ; lea leaves the flags untouched
    inc         cntq
    jnz         .rows4
    REP_RET