convolve_copy_sse2: replace SSE w/SSE2 code this should be neutral or slightly faster on modern (P4+) architectures Change-Id: Iec4c080275941eb8c9e05a66a2daf0405d86a69b
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index 9c5b414..abc0270 100644 --- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -13,15 +13,21 @@ SECTION .text %macro convolve_fn 1-2 -INIT_XMM sse2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ - fx, fxs, fy, fys, w, h +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd @@ -152,38 +158,11 @@ jnz .loop16 RET -INIT_MMX sse .w8: mov r4d, dword hm lea r5q, [src_strideq*3] lea r6q, [dst_strideq*3] .loop8: - movu m0, [srcq] - movu m1, [srcq+src_strideq] - movu m2, [srcq+src_strideq*2] - movu m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+dst_strideq] - pavg m2, [dstq+dst_strideq*2] - pavg m3, [dstq+r6q] -%endif - mova [dstq ], m0 - mova [dstq+dst_strideq ], m1 - mova [dstq+dst_strideq*2], m2 - mova [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop8 - RET - -%ifnidn %2, highbd -.w4: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop4: movh m0, [srcq] movh m1, [srcq+src_strideq] movh m2, [srcq+src_strideq*2] @@ -205,11 +184,42 @@ movh [dstq+r6q ], m3 lea dstq, [dstq+dst_strideq*4] sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 jnz .loop4 RET %endif %endmacro +INIT_XMM sse2 convolve_fn copy convolve_fn avg %if CONFIG_VP9_HIGHBITDEPTH