Merge "Remove INLINE/FORCEINLINE"
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index bb4af22..12e56ab 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -14,16 +14,6 @@
#include "loopfilter.h"
#include "onyxc_int.h"
-typedef void loop_filter_uvfunction
-(
- unsigned char *u, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh,
- unsigned char *v
-);
-
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index f051a31..66185d1 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -117,5 +117,14 @@
#define LF_INVOKE(ctx,fn) vp8_lf_##fn
#endif
+typedef void loop_filter_uvfunction
+(
+ unsigned char *u, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ unsigned char *v
+);
#endif
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 4d5d987..33a5433 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -201,6 +201,7 @@
void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level);
void vp8_init_loop_filter(VP8_COMMON *cm);
+void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
#endif
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index f11fcad..ad2f36c 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -12,6 +12,283 @@
%include "vpx_ports/x86_abi_support.asm"
+%macro LFH_FILTER_MASK 1
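+ ; Build the filter mask in xmm1: 0xFF bytes mark columns where every
+ ; abs-difference breakout test passes.
+ ; %1==1: Y plane - each row is a single aligned 16-byte load.
+ ; %1==0: u/v planes - each row is built from an 8-byte u half (rsi) and an
+ ;        8-byte v half (rdi); q2/q1/p2/p1 are also spilled to stack slots.
+ ; Exits with xmm0 = q0, xmm6 = p0, xmm5 = 0, t0 = abs(q0-q1), t1 = abs(p1-p0).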
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+%else
+ movq xmm0, [rsi + rcx*2] ; q3
+ movq xmm2, [rdi + rcx*2]
+ pslldq xmm2, 8
+ por xmm2, xmm0
+ movq xmm1, [rsi + rcx] ; q2
+ movq xmm3, [rdi + rcx]
+ pslldq xmm3, 8
+ por xmm1, xmm3
+ movdqa XMMWORD PTR [rsp], xmm1 ; store q2
+%endif
+
+ movdqa xmm6, xmm1 ; q2
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ psubusb xmm1, xmm7
+
+%if %1
+ movdqa xmm4, [rsi+rax] ; q1
+%else
+ movq xmm0, [rsi] ; q1
+ movq xmm4, [rdi]
+ pslldq xmm4, 8
+ por xmm4, xmm0
+ movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
+%endif
+
+ movdqa xmm3, xmm4 ; q1
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+ por xmm4, xmm6 ; abs(q2-q1)
+ psubusb xmm4, xmm7
+
+ por xmm1, xmm4
+
+%if %1
+ movdqa xmm4, [rsi] ; q0
+%else
+ movq xmm4, [rsi + rax] ; q0
+ movq xmm0, [rdi + rax]
+ pslldq xmm0, 8
+ por xmm4, xmm0
+%endif
+
+ movdqa xmm0, xmm4 ; q0
+ psubusb xmm4, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+ por xmm4, xmm3 ; abs(q0-q1)
+ movdqa t0, xmm4 ; save to t0
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+%if %1
+ neg rax ; negate pitch to deal with above border
+
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+%else
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movq xmm2, [rsi + rax] ; p3
+ movq xmm3, [rdi + rax]
+ pslldq xmm3, 8
+ por xmm2, xmm3
+ movq xmm4, [rsi] ; p2
+ movq xmm5, [rdi]
+ pslldq xmm5, 8
+ por xmm4, xmm5
+ movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+ por xmm4, xmm2 ; abs(p3 - p2)
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+%if %1
+ movdqa xmm4, [rsi+2*rax] ; p1
+%else
+ movq xmm4, [rsi + rcx] ; p1
+ movq xmm3, [rdi + rcx]
+ pslldq xmm3, 8
+ por xmm4, xmm3
+ movdqa XMMWORD PTR [rsp + 48], xmm4 ; store p1
+%endif
+
+ movdqa xmm3, xmm4 ; p1
+ psubusb xmm4, xmm5 ; p1-=p2
+ psubusb xmm5, xmm3 ; p2-=p1
+ por xmm4, xmm5 ; abs(p2 - p1)
+ psubusb xmm4, xmm7
+
+ por xmm1, xmm4
+ movdqa xmm2, xmm3 ; p1
+
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+%else
+ movq xmm4, [rsi + rcx*2] ; p0
+ movq xmm5, [rdi + rcx*2]
+ pslldq xmm5, 8
+ por xmm4, xmm5
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm3 ; p0-=p1
+ psubusb xmm3, xmm5 ; p1-=p0
+ por xmm4, xmm3 ; abs(p1 - p0)
+ movdqa t1, xmm4 ; save to t1
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+%if %1
+ movdqa xmm3, [rdi] ; q1
+%else
+ movdqa xmm3, q1 ; q1
+%endif
+
+ movdqa xmm4, xmm3 ; q1
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+ por xmm2, xmm3 ; abs(p1-q1)
+ pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ movdqa xmm6, xmm5 ; p0
+ movdqa xmm3, xmm0 ; q0
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ; get flimit
+ movdqa xmm2, XMMWORD PTR [rdx]
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
+ paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm1, xmm5
+ pxor xmm5, xmm5
+ pcmpeqb xmm1, xmm5 ; mask mm1
+%endmacro
+
+%macro LFH_HEV_MASK 0
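+ ; Build the high edge variance mask in xmm4: 0xFF where abs(q1-q0) > thresh
+ ; or abs(p1-p0) > thresh, using t0/t1 saved by LFH_FILTER_MASK (expects xmm5 = 0).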
+ mov rdx, arg(4) ; get thresh
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm4, t0 ; get abs (q1 - q0)
+ psubusb xmm4, xmm7
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+ psubusb xmm3, xmm7
+ paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb xmm4, xmm5
+
+ pcmpeqb xmm5, xmm5
+ pxor xmm4, xmm5
+%endmacro
+
+%macro BH_FILTER 1
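+ ; Apply the filter: vp8_filter = 3*(q0-p0) + hvm(p1-q1), masked by xmm1.
+ ; Leaves the p0 add in xmm2, the q0 subtract in xmm0 and the p1/q1 add in
+ ; xmm4; %1 selects row loads (Y) vs the stack slots (u/v) for p1/q1.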
+%if %1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%else
+ movdqa xmm2, p1 ; p1
+ movdqa xmm7, q1 ; q1
+%endif
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+ movdqa xmm2, xmm1
+ paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm5, xmm2
+ psraw xmm0, 11
+ psraw xmm5, 11
+ packsswb xmm0, xmm5
+ movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor xmm0, xmm0 ; 0
+ movdqa xmm5, xmm1 ; abcdefgh
+ punpcklbw xmm0, xmm1 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+ pxor xmm1, xmm1 ; 0
+ punpckhbw xmm1, xmm5 ; a0b0c0d0
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw xmm5, [ones GLOBAL]
+ paddsw xmm1, [ones GLOBAL]
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn xmm4, xmm5 ; high edge variance additive
+%endmacro
+
+%macro BH_WRITEBACK 1
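+ ; Store the filtered p1/p0/q0/q1. %1==1 writes whole 16-byte Y rows;
+ ; %1==0 splits each row back into its u (rsi) and v (rdi) 8-byte halves.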
+ paddsb xmm6, xmm2 ; p0+= p0 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rsi+rax], xmm6 ; write back
+%else
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi], xmm6
+%endif
+
+%if %1
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movdqa xmm6, p1 ; p1
+%endif
+ pxor xmm6, [t80 GLOBAL] ; reoffset
+ paddsb xmm6, xmm4 ; p1+= p1 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rsi+2*rax], xmm6 ; write back
+%else
+ movq MMWORD PTR [rsi + rax], xmm6 ; p1
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi + rax], xmm6
+%endif
+
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rsi], xmm3 ; write back
+%else
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi + rcx], xmm3
+%endif
+
+ psubsb xmm7, xmm4 ; q1-= q1 add
+ pxor xmm7, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rdi], xmm7 ; write back
+%else
+ movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
+ psrldq xmm7, 8
+ movq MMWORD PTR [rdi + rcx*2],xmm7
+%endif
+%endmacro
+
+
;void vp8_loop_filter_horizontal_edge_sse2
;(
; unsigned char *src_ptr,
@@ -33,179 +310,28 @@
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
+ sub rsp, 32 ; reserve 32 bytes
%define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
; calculate breakout conditions
- movdqu xmm2, [rdi+2*rax] ; q3
- movdqu xmm1, [rsi+2*rax] ; q2
- movdqa xmm6, xmm1 ; q2
- psubusb xmm1, xmm2 ; q2-=q3
- psubusb xmm2, xmm6 ; q3-=q2
- por xmm1, xmm2 ; abs(q3-q2)
- psubusb xmm1, xmm7 ;
-
-
- movdqu xmm4, [rsi+rax] ; q1
- movdqa xmm3, xmm4 ; q1
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
- por xmm4, xmm6 ; abs(q2-q1)
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- movdqu xmm4, [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
- psubusb xmm4, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
- por xmm4, xmm3 ; abs(q0-q1)
- movdqa t0, xmm4 ; save to t0
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- neg rax ; negate pitch to deal with above border
- movdqu xmm2, [rsi+4*rax] ; p3
- movdqu xmm4, [rdi+4*rax] ; p2
- movdqa xmm5, xmm4 ; p2
- psubusb xmm4, xmm2 ; p2-=p3
- psubusb xmm2, xmm5 ; p3-=p2
- por xmm4, xmm2 ; abs(p3 - p2)
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
-
- movdqu xmm4, [rsi+2*rax] ; p1
- movdqa xmm3, xmm4 ; p1
- psubusb xmm4, xmm5 ; p1-=p2
- psubusb xmm5, xmm3 ; p2-=p1
- por xmm4, xmm5 ; abs(p2 - p1)
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- movdqa xmm2, xmm3 ; p1
-
- movdqu xmm4, [rsi+rax] ; p0
- movdqa xmm5, xmm4 ; p0
- psubusb xmm4, xmm3 ; p0-=p1
- psubusb xmm3, xmm5 ; p1-=p0
- por xmm4, xmm3 ; abs(p1 - p0)
- movdqa t1, xmm4 ; save to t1
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- movdqu xmm3, [rdi] ; q1
- movdqa xmm4, xmm3 ; q1
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
- por xmm2, xmm3 ; abs(p1-q1)
- pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- movdqa xmm6, xmm5 ; p0
- movdqu xmm3, [rsi] ; q0
- psubusb xmm5, xmm3 ; p0-=q0
- psubusb xmm3, xmm6 ; q0-=p0
- por xmm5, xmm3 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ; get flimit
- movdqa xmm2, [rdx] ;
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm5
- pxor xmm5, xmm5
- pcmpeqb xmm1, xmm5 ; mask mm1
-
+ LFH_FILTER_MASK 1
; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, [rdx] ;
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
- paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm5
- pcmpeqb xmm5, xmm5
- pxor xmm4, xmm5
-
+ LFH_HEV_MASK
; start work on filters
- movdqu xmm2, [rsi+2*rax] ; p1
- movdqu xmm7, [rdi] ; q1
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
- movdqa xmm2, xmm1
- paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- pxor xmm0, xmm0 ;
- pxor xmm5, xmm5
- punpcklbw xmm0, xmm2 ;
- punpckhbw xmm5, xmm2 ;
- psraw xmm0, 11 ;
- psraw xmm5, 11
- packsswb xmm0, xmm5
- movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor xmm0, xmm0 ; 0
- movdqa xmm5, xmm1 ; abcdefgh
- punpcklbw xmm0, xmm1 ; e0f0g0h0
- psraw xmm0, 11 ; sign extended shift right by 3
- pxor xmm1, xmm1 ; 0
- punpckhbw xmm1, xmm5 ; a0b0c0d0
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [ones GLOBAL]
- paddsw xmm1, [ones GLOBAL]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- pandn xmm4, xmm5 ; high edge variance additive
-
- paddsb xmm6, xmm2 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
- movdqu [rsi+rax], xmm6 ; write back
-
- movdqu xmm6, [rsi+2*rax] ; p1
- pxor xmm6, [t80 GLOBAL] ; reoffset
- paddsb xmm6, xmm4 ; p1+= p1 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
- movdqu [rsi+2*rax], xmm6 ; write back
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
- movdqu [rsi], xmm3 ; write back
-
- psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [t80 GLOBAL] ; unoffset
- movdqu [rdi], xmm7 ; write back
+ BH_FILTER 1
+ ; write back the result
+ BH_WRITEBACK 1
add rsp, 32
pop rsp
@@ -219,7 +345,7 @@
ret
-;void vp8_loop_filter_vertical_edge_sse2
+;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
@@ -228,8 +354,8 @@
; const char *thresh,
; int count
;)
-global sym(vp8_loop_filter_vertical_edge_sse2)
-sym(vp8_loop_filter_vertical_edge_sse2):
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -240,414 +366,35 @@
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+ sub rsp, 96 ; reserve 96 bytes
+ %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
+ %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
+ %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
+ %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
+ %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
- lea rsi, [rsi + rax*4 - 4]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
- add rdi, rax
- lea rcx, [rdi + rax *8]
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
- ;transpose
- movq xmm7, QWORD PTR [rsi+2*rax] ; 67 66 65 64 63 62 61 60
- movq xmm6, QWORD PTR [rdi+2*rax] ; 77 76 75 74 73 72 71 70
-
- punpcklbw xmm7, xmm6 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
- movq xmm5, QWORD PTR [rsi] ; 47 46 45 44 43 42 41 40
-
- movq xmm4, QWORD PTR [rsi+rax] ; 57 56 55 54 53 52 51 50
- punpcklbw xmm5, xmm4 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-
- movdqa xmm3, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpckhwd xmm5, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-
- lea rsi, [rsi+ rax*8]
-
- punpcklwd xmm3, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
- movq xmm6, QWORD PTR [rsi + 2*rax] ; e7 e6 e5 e4 e3 e2 e1 e0
-
- movq xmm7, QWORD PTR [rcx + 2*rax] ; f7 f6 f5 f4 f3 f2 f1 f0
- punpcklbw xmm6, xmm7 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
- movq xmm4, QWORD PTR [rsi] ; c7 c6 c5 c4 c3 c2 c1 c0
- movq xmm7, QWORD PTR [rsi + rax] ; d7 d6 d5 d4 d3 d2 d1 d0
-
- punpcklbw xmm4, xmm7 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
- movdqa xmm7, xmm4 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
-
- punpckhwd xmm7, xmm6 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
- punpcklwd xmm4, xmm6 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
- ; xmm3 xmm4, xmm5 xmm7 in use
- neg rax
-
- lea rsi, [rsi+rax*8]
- movq xmm6, QWORD PTR [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq xmm1, QWORD PTR [rsi+rax ] ; 37 36 35 34 33 32 31 30
- punpcklbw xmm6, xmm1 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm2, QWORD PTR [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- movq xmm1, QWORD PTR [rdi+rax*4] ; 17 16 15 14 13 12 11 10
-
- punpcklbw xmm2, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- movdqa xmm0, xmm2
-
- punpckhwd xmm2, xmm6 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- punpcklwd xmm0, xmm6 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-
- movdqa xmm6, xmm2
- punpckldq xmm2, xmm5 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckhdq xmm6, xmm5 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
- ;xmm0 xmm2 xmm3 xmm4, xmm6, xmm7
-
- movdqa xmm5, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhdq xmm5, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- punpckldq xmm0, xmm3 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- lea rsi, [rcx+rax]
- ; xmm1, xmm3 free
- movq xmm1, QWORD PTR [rsi+rax*2] ; a7 a6 a5 a4 a3 a2 a1 a0
- movq xmm3, QWORD PTR [rsi+rax] ; b7 b6 b5 b4 b3 b2 b1 b0
-
- punpcklbw xmm1, xmm3 ;
- lea rdx, srct ;
-
- movdqa [rdx+16], xmm1 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
- movq xmm3, QWORD PTR [rsi+rax*4] ; 87 86 85 84 83 82 81 80
-
- movq xmm1, QWORD PTR [rcx+rax*4]
- punpcklbw xmm3, xmm1 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- movdqa [rdx], xmm3 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpckhwd xmm3, [rdx+16] ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
- movdqa xmm1, xmm3 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckhdq xmm1, xmm7 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
- punpckldq xmm3, xmm7 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
- movdqa xmm7, xmm2 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm7, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- punpckhqdq xmm2, xmm3 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa [rdx+32], xmm7 ; save 4s
-
- movdqa [rdx+48], xmm2 ; save 5s
- movdqa xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpckhqdq xmm7, xmm1 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 = q3
- punpcklqdq xmm6, xmm1 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 = q2
-
- ; free 1, 3 xmm7-7s xmm6-6s, xmm2-5s
- movq xmm1, QWORD PTR [rdx] ; 93 83 92 82 91 81 90 80
- movq xmm3, QWORD PTR [rdx+16] ; b3 a3 b2 a2 b1 a1 b0 a0
-
- punpcklwd xmm1, xmm3 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- movdqa xmm3, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
- punpckhdq xmm3, xmm4 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- punpckldq xmm1, xmm4 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
- movdqa xmm4, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpcklqdq xmm5, xmm3 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm4, xmm3 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa [rdx], xmm5 ; save 2s
-
- movdqa [rdx+16], xmm4 ; save 3s
-
- movdqa xmm3, xmm6 ;
- psubusb xmm3, xmm7 ; q3 - q2
-
- psubusb xmm7, xmm6 ; q2 - q3
- por xmm7, xmm3 ; abs(q3-q2)
-
- movdqa xmm3, xmm2 ; q1
- psubusb xmm3, xmm6 ; q1 - q2
-
- psubusb xmm6, xmm2 ; q2 - q1
- por xmm6, xmm3 ; abs(q2-q1)
-
-
- movdqa xmm3, xmm0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- punpcklqdq xmm0, xmm1 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- punpckhqdq xmm3, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- movdqa xmm1, xmm3
-
- psubusb xmm3, xmm0 ; p2-p3
- psubusb xmm0, xmm1 ; p3-p2
-
- por xmm0, xmm3 ; abs(p3-p2)
- movdqa xmm3, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- psubusb xmm3, xmm1 ; p1-p2
- psubusb xmm1, xmm5 ; p2-p1
-
- por xmm1, xmm3 ; abs(p1-p2)
- mov rdx, arg(3) ;limit
-
- movdqa xmm3, [rdx] ; limit
-
- psubusb xmm7, xmm3
- psubusb xmm0, xmm3
-
- psubusb xmm1, xmm3
- psubusb xmm6, xmm3
-
- por xmm7, xmm6
- por xmm0, xmm1
-
- por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movdqa xmm1, xmm5 ; p1
-
- movdqa xmm7, xmm4 ; xmm4 xmm7 = p0
-
- psubusb xmm7, xmm5 ; p0 - p1
- psubusb xmm5, xmm4 ; p1 - p0
-
- por xmm5, xmm7 ; abs(p1-p0)
- movdqa t0, xmm5 ; save abs(p1-p0)
-
- lea rdx, srct
- psubusb xmm5, xmm3
-
- por xmm0, xmm5 ; xmm0=mask
- movdqa xmm5, [rdx+32] ; xmm5=q0
-
- movdqa xmm7, [rdx+48] ; xmm7=q1
- movdqa xmm6, xmm5 ; mm6=q0
-
- movdqa xmm2, xmm7 ; q1
-
- psubusb xmm5, xmm7 ; q0-q1
- psubusb xmm7, xmm6 ; q1-q0
-
- por xmm7, xmm5 ; abs(q1-q0)
- movdqa t1, xmm7 ; save abs(q1-q0)
-
- psubusb xmm7, xmm3
- por xmm0, xmm7 ; mask
-
- movdqa xmm5, xmm2 ; q1
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
- por xmm5, xmm1 ; abs(p1-q1)
- pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ;
- movdqa xmm2, [rdx] ;flimit xmm2
-
- movdqa xmm1, xmm4 ; xmm1=xmm4=p0
-
- movdqa xmm7, xmm6 ; xmm7=xmm6=q0
- psubusb xmm1, xmm7 ; p0-q0
-
- psubusb xmm7, xmm4 ; q0-p0
- por xmm1, xmm7 ; abs(q0-p0)
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm3, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm1, xmm3 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
-
- por xmm1, xmm0; ; mask
-
- pxor xmm0, xmm0
- pcmpeqb xmm1, xmm0
+ ; calculate breakout conditions
+ LFH_FILTER_MASK 0
; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, [rdx]
-
- ;
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
-
- por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm0
-
- pcmpeqb xmm0, xmm0
- pxor xmm4, xmm0
+ LFH_HEV_MASK
; start work on filters
- lea rdx, srct
-
- movdqa xmm2, [rdx] ; p1
- movdqa xmm7, [rdx+48] ; q1
-
- movdqa xmm6, [rdx+16] ; p0
- movdqa xmm0, [rdx+32] ; q0
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
-
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
-
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1
- paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
- paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
- pxor xmm0, xmm0 ;
-
- pxor xmm5, xmm5
- punpcklbw xmm0, xmm2 ;
-
- punpckhbw xmm5, xmm2 ;
- psraw xmm0, 11 ;
-
- psraw xmm5, 11
- packsswb xmm0, xmm5
-
- movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor xmm0, xmm0 ; 0
- movdqa xmm5, xmm1 ; abcdefgh
-
- punpcklbw xmm0, xmm1 ; e0f0g0h0
- psraw xmm0, 11 ; sign extended shift right by 3
-
- pxor xmm1, xmm1 ; 0
- punpckhbw xmm1, xmm5 ; a0b0c0d0
-
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [ones GLOBAL]
-
- paddsw xmm1, [ones GLOBAL]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
- pandn xmm4, xmm5 ; high edge variance additive
-
- paddsb xmm6, xmm2 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
-
- ; mm6=p0 ;
- movdqa xmm1, [rdx] ; p1
- pxor xmm1, [t80 GLOBAL] ; reoffset
-
- paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm1, [t80 GLOBAL] ; unoffset
- ; mm6 = p0 mm1 = p1
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
-
- ; mm3 = q0
- psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [t80 GLOBAL] ; unoffset
- ; mm7 = q1
-
- ; tranpose and write back
- ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
- movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
- punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklwd xmm1, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
- punpckhwd xmm5, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
- ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
- ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- ; xmm5 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
- ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
- lea rsi, [rsi+rax*8]
-
- movd [rsi+rax*4+2], xmm2
- psrldq xmm2, 4
-
- movd [rdi+rax*4+2], xmm2
- psrldq xmm2, 4
-
- movd [rsi+rax*2+2], xmm2
- psrldq xmm2, 4
-
- movd [rdi+rax*2+2], xmm2
- movd [rsi+2], xmm6
-
- psrldq xmm6, 4
- movd [rdi+2], xmm6
-
- psrldq xmm6, 4
- neg rax
-
- movd [rdi+rax+2], xmm6
- psrldq xmm6, 4
-
- movd [rdi+rax*2+2], xmm6
- lea rsi, [rsi+rax*8]
-
- neg rax
- ;;;;;;;;;;;;;;;;;;;;/
- movd [rsi+rax*4+2], xmm1
- psrldq xmm1, 4
-
- movd [rcx+rax*4+2], xmm1
- psrldq xmm1, 4
-
- movd [rsi+rax*2+2], xmm1
- psrldq xmm1, 4
-
- movd [rcx+rax*2+2], xmm1
- psrldq xmm1, 4
-
- movd [rsi+2], xmm5
- psrldq xmm5, 4
-
- movd [rcx+2], xmm5
- psrldq xmm5, 4
-
- neg rax
- movd [rcx+rax+2], xmm5
-
- psrldq xmm5, 4
- movd [rcx+rax*2+2], xmm5
+ BH_FILTER 0
+ ; write back the result
+ BH_WRITEBACK 0
add rsp, 96
pop rsp
@@ -661,233 +408,58 @@
ret
-;void vp8_mbloop_filter_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *flimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_sse2)
-sym(vp8_mbloop_filter_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
- ; calculate breakout conditions
- movdqa xmm2, XMMWORD PTR [rdi+2*rax] ; q3
- movdqa xmm1, XMMWORD PTR [rsi+2*rax] ; q2
-
- movdqa xmm6, xmm1 ; q2
- psubusb xmm1, xmm2 ; q2-=q3
-
-
- psubusb xmm2, xmm6 ; q3-=q2
- por xmm1, xmm2 ; abs(q3-q2)
-
- psubusb xmm1, xmm7
-
- ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
- movdqa xmm4, XMMWORD PTR [rsi+rax] ; q1
- movdqa xmm3, xmm4 ; q1
-
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
-
- por xmm4, xmm6 ; abs(q2-q1)
- psubusb xmm4, xmm7
-
- por xmm1, xmm4
- ; mm1 = mask, mm3=q1, mm7 = limit
-
- movdqa xmm4, XMMWORD PTR [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
-
- psubusb xmm4, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
-
- por xmm4, xmm3 ; abs(q0-q1)
- movdqa t0, xmm4 ; save to t0
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
- neg rax ; negate pitch to deal with above border
-
- movdqa xmm2, XMMWORD PTR [rsi+4*rax] ; p3
- movdqa xmm4, XMMWORD PTR [rdi+4*rax] ; p2
-
- movdqa xmm5, xmm4 ; p2
- psubusb xmm4, xmm2 ; p2-=p3
-
- psubusb xmm2, xmm5 ; p3-=p2
- por xmm4, xmm2 ; abs(p3 - p2)
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
- movdqa xmm4, XMMWORD PTR [rsi+2*rax] ; p1
- movdqa xmm3, xmm4 ; p1
-
- psubusb xmm4, xmm5 ; p1-=p2
- psubusb xmm5, xmm3 ; p2-=p1
-
- por xmm4, xmm5 ; abs(p2 - p1)
- psubusb xmm4, xmm7
-
- por xmm1, xmm4
-
- movdqa xmm2, xmm3 ; p1
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
- movdqa xmm4, XMMWORD PTR [rsi+rax] ; p0
- movdqa xmm5, xmm4 ; p0
-
- psubusb xmm4, xmm3 ; p0-=p1
- psubusb xmm3, xmm5 ; p1-=p0
-
- por xmm4, xmm3 ; abs(p1 - p0)
- movdqa t1, xmm4 ; save to t1
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm5 = p0
- movdqa xmm3, XMMWORD PTR [rdi] ; q1
- movdqa xmm4, xmm3 ; q1
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
- por xmm2, xmm3 ; abs(p1-q1)
- pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- movdqa xmm6, xmm5 ; p0
- movdqa xmm3, xmm0 ; q0
-
- psubusb xmm5, xmm3 ; p0-=q0
- psubusb xmm3, xmm6 ; q0-=p0
-
- por xmm5, xmm3 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ; get flimit
- movdqa xmm2, XMMWORD PTR [rdx] ;
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm5
- pxor xmm5, xmm5
- pcmpeqb xmm1, xmm5 ; mask mm1
- ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm6 = p0,
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, XMMWORD PTR [rdx] ;
-
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
-
- paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm5
-
- pcmpeqb xmm5, xmm5
- pxor xmm4, xmm5
- ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm6 = p0, mm4=hev
- ; start work on filters
- movdqa xmm2, XMMWORD PTR [rsi+2*rax] ; p1
- movdqa xmm7, XMMWORD PTR [rdi] ; q1
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+%macro MBH_FILTER 1
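+ ; Macroblock-edge filter: Filter2 = vp8_filter & hev adjusts p0/q0 here,
+ ; while vp8_filter & ~hev is left in xmm4 for the 27/18/9 taps applied in
+ ; MBH_WRITEBACK. %1 selects row loads (Y) vs stack slots (u/v) for p1/q1.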
+%if %1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%else
+ movdqa xmm2, p1 ; p1
+ movdqa xmm7, q1 ; q1
+%endif
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
-
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
-
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
paddsb xmm2, xmm0 ; 2 * (q0 - p0)
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
pand xmm1, xmm2 ; mask filter values we don't care about
- ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
movdqa xmm2, xmm1 ; vp8_filter
pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
-
- movdqa xmm5, xmm2 ;
- paddsb xmm5, [t3 GLOBAL];
+ movdqa xmm5, xmm2
+ paddsb xmm5, [t3 GLOBAL]
pxor xmm0, xmm0 ; 0
pxor xmm7, xmm7 ; 0
-
punpcklbw xmm0, xmm5 ; e0f0g0h0
psraw xmm0, 11 ; sign extended shift right by 3
-
punpckhbw xmm7, xmm5 ; a0b0c0d0
psraw xmm7, 11 ; sign extended shift right by 3
-
packsswb xmm0, xmm7 ; Filter2 >>=3;
movdqa xmm5, xmm0 ; Filter2
-
paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor xmm0, xmm0 ; 0
+ pxor xmm0, xmm0 ; 0
pxor xmm7, xmm7 ; 0
punpcklbw xmm0, xmm2 ; e0f0g0h0
-
psraw xmm0, 11 ; sign extended shift right by 3
punpckhbw xmm7, xmm2 ; a0b0c0d0
-
psraw xmm7, 11 ; sign extended shift right by 3
packsswb xmm0, xmm7 ; Filter2 >>=3;
- ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
psubsb xmm3, xmm0 ; qs0 = qs0 - Filter1
paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
- ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
pandn xmm4, xmm1 ; vp8_filter&=~hev
+%endmacro
-
- ; mm3=qs0, mm4=filter2, mm6=ps0
-
+%macro MBH_WRITEBACK 1
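+ ; Apply u = vp8_signed_char_clamp((63 + Filter2 * {27,18,9}) >> 7) across the
+ ; edge and store p2..q2; %1==1 writes Y rows, %1==0 writes u/v 8-byte halves.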
; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
; s = vp8_signed_char_clamp(qs0 - u);
; *oq0 = s^0x80;
@@ -917,8 +489,20 @@
pxor xmm3, [t80 GLOBAL]
pxor xmm6, [t80 GLOBAL]
+%if %1
movdqa XMMWORD PTR [rsi+rax], xmm6
movdqa XMMWORD PTR [rsi], xmm3
+%else
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi], xmm6
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi + rcx], xmm3
+%endif
; roughly 2/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
@@ -943,8 +527,13 @@
packsswb xmm1, xmm2
+%if %1
movdqa xmm3, XMMWORD PTR [rdi]
- movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1
+ movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1
+%else
+ movdqa xmm3, q1 ; q1
+ movdqa xmm6, p1 ; p1
+%endif
pxor xmm3, [t80 GLOBAL]
pxor xmm6, [t80 GLOBAL]
@@ -955,9 +544,18 @@
pxor xmm6, [t80 GLOBAL]
pxor xmm3, [t80 GLOBAL]
+%if %1
movdqa XMMWORD PTR [rdi], xmm3
movdqa XMMWORD PTR [rsi+rax*2],xmm6
+%else
+ movq MMWORD PTR [rsi + rcx*2],xmm3 ; q1
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi + rcx*2],xmm3
+ movq MMWORD PTR [rsi + rax], xmm6 ; p1
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi + rax], xmm6
+%endif
; roughly 1/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
; s = vp8_signed_char_clamp(qs2 - u);
@@ -981,11 +579,15 @@
packsswb xmm1, xmm2
-
+%if %1
movdqa xmm6, XMMWORD PTR [rdi+rax*4]
neg rax
- movdqa xmm3, XMMWORD PTR [rdi+rax ]
+ movdqa xmm3, XMMWORD PTR [rdi+rax]
+%else
+ movdqa xmm6, p2 ; p2
+ movdqa xmm3, q2 ; q2
+%endif
pxor xmm6, [t80 GLOBAL]
pxor xmm3, [t80 GLOBAL]
@@ -995,11 +597,68 @@
pxor xmm6, [t80 GLOBAL]
pxor xmm3, [t80 GLOBAL]
-
- movdqa XMMWORD PTR [rdi+rax ], xmm3
+%if %1
+ movdqa XMMWORD PTR [rdi+rax ],xmm3
neg rax
- movdqa XMMWORD PTR [rdi+rax*4], xmm6
+ movdqa XMMWORD PTR [rdi+rax*4],xmm6
+%else
+ movq MMWORD PTR [rsi+rax*2], xmm6 ; p2
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi+rax*2], xmm6
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+ movq MMWORD PTR [rsi+rcx*2 ],xmm3 ; q2
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi+rcx*2 ],xmm3
+%endif
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions
+ LFH_FILTER_MASK 1
+
+ ; calculate high edge variance
+ LFH_HEV_MASK
+
+ ; start work on filters
+ MBH_FILTER 1
+ ; write back the result
+ MBH_WRITEBACK 1
add rsp, 32
pop rsp
@@ -1013,6 +672,877 @@
ret
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
+ %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
+ %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
+ %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
+ %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions
+ LFH_FILTER_MASK 0
+
+ ; calculate high edge variance
+ LFH_HEV_MASK
+
+ ; start work on filters
+ MBH_FILTER 0
+ ; write back the result
+ MBH_WRITEBACK 0
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro TRANSPOSE_16X8_1 0
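+ ; First half of the 16x8 -> 8x16 transpose: reads 8-byte rows 0-7 and leaves
+ ; column pairs 6/7, 4/5 and 2/3 in xmm7, xmm4 and xmm3, with pair 0/1 in t0.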
+ movq xmm0, QWORD PTR [rdi+rcx*2] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+ movq xmm7, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm7, xmm0 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ movq xmm0, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ movq xmm5, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ punpcklbw xmm5, xmm0 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ movq xmm7, QWORD PTR [rsi + rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+
+ movq xmm0, QWORD PTR [rsi + rax*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm4, QWORD PTR [rsi + rax*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm7, QWORD PTR [rdi + rax*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+
+ punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa t0, xmm2 ; save to free XMM2
+%endmacro
+
+%macro TRANSPOSE_16X8_2 1
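+ ; Second half: transposes rows 8-15 and merges them with the first half.
+ ; %1==1 stores only p1..q1 at [rdx] .. [rdx+48] (64-byte srct, B edges);
+ ; %1==0 stores all eight lines p3..q3 at [rdx] .. [rdx+112] (MB edges).
+ ; Either way it exits with p3/p2/p0 in xmm2/xmm1/xmm3 and q2/q3 in xmm6/xmm7.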
+ movq xmm6, QWORD PTR [rdi+rcx*2] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+ movq xmm5, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+ movq xmm6, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ movq xmm1, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movq xmm5, QWORD PTR [rsi+rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+
+ movq xmm0, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm2, QWORD PTR [rsi+rax*4] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm5, QWORD PTR [rdi+rax*4] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+
+ punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+%if %1
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa [rdx], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+16], xmm3 ; save 3
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+32], xmm4 ; save 4
+ movdqa [rdx+48], xmm5 ; save 5
+
+ movdqa xmm1, t0 ; get lines 0 and 1 saved by TRANSPOSE_16X8_1
+ movdqa xmm2, xmm1
+
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+%else
+ movdqa [rdx+112], xmm7 ; save 7
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ movdqa [rdx+96], xmm6 ; save 6
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa [rdx+32], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+48], xmm3 ; save 3
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+64], xmm4 ; save 4
+ movdqa [rdx+80], xmm5 ; save 5
+
+ movdqa xmm1, t0 ; get lines 0 and 1 saved by TRANSPOSE_16X8_1
+ movdqa xmm2, xmm1
+
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ movdqa [rdx+16], xmm1
+ movdqa [rdx], xmm2
+%endif
+%endmacro
+
+%macro LFV_FILTER_MASK 1
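+ ; Vertical-edge breakout test on the transposed lines: expects p3/p2/p0 in
+ ; xmm2/xmm1/xmm3 and q2/q3 in xmm6/xmm7, reads p1/q0/q1 from srct (%1 picks
+ ; the 4-line vs 8-line layout) and exits with the mask in xmm1 and xmm0 = 0.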
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ por xmm7, xmm0 ; abs (q3-q2)
+
+ movdqa xmm4, xmm5 ; q1
+ psubusb xmm4, xmm6 ; q1-q2
+
+ psubusb xmm6, xmm5 ; q2-q1
+ por xmm6, xmm4 ; abs (q2-q1)
+
+ movdqa xmm0, xmm1
+
+ psubusb xmm0, xmm2 ; p2 - p3;
+ psubusb xmm2, xmm1 ; p3 - p2;
+
+ por xmm0, xmm2 ; abs(p2-p3)
+%if %1
+ movdqa xmm2, [rdx] ; p1
+%else
+ movdqa xmm2, [rdx+32] ; p1
+%endif
+ movdqa xmm5, xmm2 ; p1
+
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+
+ mov rdx, arg(3) ; limit
+ movdqa xmm4, [rdx] ; limit
+
+ psubusb xmm7, xmm4
+
+ psubusb xmm0, xmm4 ; abs(p3-p2) > limit
+ psubusb xmm1, xmm4 ; abs(p2-p1) > limit
+
+ psubusb xmm6, xmm4 ; abs(q2-q1) > limit
+ por xmm7, xmm6 ; or
+
+ por xmm0, xmm1
+ por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movdqa xmm1, xmm2 ; p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ movdqa t0, xmm2 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb xmm2, xmm4 ; abs(p1-p0)>limit
+ por xmm0, xmm2 ; mask
+%if %1
+ movdqa xmm5, [rdx+32] ; q0
+ movdqa xmm7, [rdx+48] ; q1
+%else
+ movdqa xmm5, [rdx+64] ; q0
+ movdqa xmm7, [rdx+80] ; q1
+%endif
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm2, xmm7 ; q1
+ psubusb xmm5, xmm7 ; q0-q1
+
+ psubusb xmm7, xmm6 ; q1-q0
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ movdqa t1, xmm7 ; save abs(q1-q0)
+ psubusb xmm7, xmm4 ; abs(q1-q0)> limit
+
+ por xmm0, xmm7 ; mask
+
+ movdqa xmm5, xmm2 ; q1
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm2 ; p1-=q1
+ por xmm5, xmm1 ; abs(p1-q1)
+ pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ; flimit
+ movdqa xmm2, [rdx] ; flimit
+
+ movdqa xmm1, xmm3 ; p0
+ movdqa xmm7, xmm6 ; q0
+ psubusb xmm1, xmm7 ; p0-q0
+ psubusb xmm7, xmm3 ; q0-p0
+ por xmm1, xmm7 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
+ paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm1, xmm0 ; mask
+ pxor xmm0, xmm0
+ pcmpeqb xmm1, xmm0
+%endmacro
+
+%macro LFV_HEV_MASK 0
+ mov rdx, arg(4) ; get thresh
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm4, t0 ; get abs (q1 - q0)
+ psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+ psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
+
+ por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb xmm4, xmm0
+
+ pcmpeqb xmm0, xmm0
+ pxor xmm4, xmm0
+%endmacro
+
+%macro BV_FILTER 0
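+ ; Vertical-edge filter: p1/p0/q0/q1 come from srct; exits with the filtered
+ ; p1/p0/q0/q1 in xmm1/xmm6/xmm3/xmm7 ready for BV_TRANSPOSE.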
+ lea rdx, srct
+
+ movdqa xmm2, [rdx] ; p1
+ lea rdi, [rsi+rcx]
+ movdqa xmm7, [rdx+48] ; q1
+ movdqa xmm6, [rdx+16] ; p0
+ movdqa xmm0, [rdx+32] ; q0
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+ paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor xmm0, xmm0
+
+ pxor xmm5, xmm5
+ punpcklbw xmm0, xmm2
+
+ punpckhbw xmm5, xmm2
+ psraw xmm0, 11
+
+ psraw xmm5, 11
+ packsswb xmm0, xmm5
+
+ movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor xmm0, xmm0 ; 0
+ movdqa xmm5, xmm1 ; abcdefgh
+
+ punpcklbw xmm0, xmm1 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ pxor xmm1, xmm1 ; 0
+ punpckhbw xmm1, xmm5 ; a0b0c0d0
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw xmm5, [ones GLOBAL]
+
+ paddsw xmm1, [ones GLOBAL]
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn xmm4, xmm5 ; high edge variance additive
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+
+ movdqa xmm1, [rdx] ; p1
+ pxor xmm1, [t80 GLOBAL] ; reoffset
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm1, [t80 GLOBAL] ; unoffset
+
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [t80 GLOBAL] ; unoffset
+
+ psubsb xmm7, xmm4 ; q1-= q1 add
+ pxor xmm7, [t80 GLOBAL] ; unoffset
+%endmacro
+
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
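+ ; Scatter the filtered columns (4 bytes per row at offset +2, i.e. p1..q1)
+ ; back into eight rows: %1 carries rows 0-3 of the group, %2 rows 4-7.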
+ movd [rsi+rax*4+2], %1
+ psrldq %1, 4
+
+ movd [rdi+rax*4+2], %1
+ psrldq %1, 4
+
+ movd [rsi+rax*2+2], %1
+ psrldq %1, 4
+
+ movd [rdi+rax*2+2], %1
+
+ movd [rsi+2], %2
+ psrldq %2, 4
+
+ movd [rdi+2], %2
+ psrldq %2, 4
+
+ movd [rdi+rcx+2], %2
+ psrldq %2, 4
+
+ movd [rdi+rcx*2+2], %2
+%endmacro
+
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2)
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ mov rcx, rax
+ neg rax
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8_1
+
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
+ lea rdx, srct
+ TRANSPOSE_16X8_2 1
+
+ ; calculate filter mask
+ LFV_FILTER_MASK 1
+ ; calculate high edge variance
+ LFV_HEV_MASK
+
+ ; start work on filters
+ BV_FILTER
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rsi+rcx]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2)
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ mov rcx, rax
+ neg rax
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8_1
+
+ mov rsi, arg(5) ; v_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx] ; rdi points to row +1 for indirect addressing
+
+ lea rdx, srct
+ TRANSPOSE_16X8_2 1
+
+ ; calculate filter mask
+ LFV_FILTER_MASK 1
+ ; calculate high edge variance
+ LFV_HEV_MASK
+
+ ; start work on filters
+ BV_FILTER
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro MBV_FILTER 0
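+ ; Macroblock vertical-edge filter: works on the eight transposed lines in
+ ; srct, storing the updated p1..q1 back to srct and exiting with the new
+ ; p2 in xmm6 and the new q2 in xmm3 for MBV_TRANSPOSE.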
+ lea rdx, srct
+
+ movdqa xmm2, [rdx+32] ; p1
+ movdqa xmm7, [rdx+80] ; q1
+ movdqa xmm6, [rdx+48] ; p0
+ movdqa xmm0, [rdx+64] ; q0
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1)
+
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+
+ movdqa xmm5, xmm2
+ paddsb xmm5, [t3 GLOBAL]
+
+ pxor xmm0, xmm0 ; 0
+ pxor xmm7, xmm7 ; 0
+
+ punpcklbw xmm0, xmm5 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm7, xmm5 ; a0b0c0d0
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+ movdqa xmm5, xmm0 ; Filter2
+
+ paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor xmm0, xmm0 ; 0
+
+ pxor xmm7, xmm7 ; 0
+ punpcklbw xmm0, xmm2 ; e0f0g0h0
+
+ psraw xmm0, 11 ; sign extended shift right by 3
+ punpckhbw xmm7, xmm2 ; a0b0c0d0
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+
+ psubsb xmm3, xmm0 ; qs0 = qs0 - Filter1
+ paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
+
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ pxor xmm2, xmm2
+ punpcklbw xmm1, xmm4
+
+ punpckhbw xmm2, xmm4
+ pmulhw xmm1, [s27 GLOBAL]
+
+ pmulhw xmm2, [s27 GLOBAL]
+ paddw xmm1, [s63 GLOBAL]
+
+ paddw xmm2, [s63 GLOBAL]
+ psraw xmm1, 7
+
+ psraw xmm2, 7
+ packsswb xmm1, xmm2
+
+ psubsb xmm3, xmm1
+ paddsb xmm6, xmm1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ movdqa [rdx+48], xmm6
+ movdqa [rdx+64], xmm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s18 GLOBAL]
+ pmulhw xmm2, [s18 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm3, [rdx + 80] ; q1
+ movdqa xmm6, [rdx + 32] ; p1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ movdqa [rdx + 80], xmm3
+ movdqa [rdx + 32], xmm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s9 GLOBAL]
+ pmulhw xmm2, [s9 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm6, [rdx+16]
+ movdqa xmm3, [rdx+96]
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+%endmacro
+
+%macro MBV_TRANSPOSE 0
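+ ; Transpose the filtered 8x16 block back toward 16 rows: combines the lines
+ ; stored in srct with xmm6 (new p2) and xmm3 (new q2) left by MBV_FILTER.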
+ movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
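Stripped of the register choreography, MBV_TRANSPOSE is a byte transpose: it turns the eight 16-wide result rows held in srct (one row per pixel position, p3..q3, for sixteen pixel rows) back into row-major pixel order. A scalar model, purely for orientation:

    /* Illustrative only: what the punpck ladder rebuilds. */
    static void transpose_8x16(const unsigned char in[8][16],
                               unsigned char out[16][8])
    {
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 16; c++)
                out[c][r] = in[r][c];
    }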
+
+%macro MBV_WRITEBACK_1 0
+ movq QWORD PTR [rsi+rax*4], xmm0
+ psrldq xmm0, 8
+
+ movq QWORD PTR [rsi+rax*2], xmm6
+ psrldq xmm6, 8
+
+ movq QWORD PTR [rdi+rax*4], xmm0
+ movq QWORD PTR [rsi+rax], xmm6
+
+ movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+
+ punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq QWORD PTR [rsi], xmm0
+ psrldq xmm0, 8
+
+ movq QWORD PTR [rsi+rcx*2], xmm5
+ psrldq xmm5, 8
+
+ movq QWORD PTR [rsi+rcx], xmm0
+ movq QWORD PTR [rdi+rcx*2], xmm5
+
+ movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+ movdqa xmm0, xmm2
+
+ punpcklwd xmm0, xmm3 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+
+ punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+%macro MBV_WRITEBACK_2 0
+ movq QWORD PTR [rsi+rax*4], xmm1
+ psrldq xmm1, 8
+
+ movq QWORD PTR [rsi+rax*2], xmm3
+ psrldq xmm3, 8
+
+ movq QWORD PTR [rdi+rax*4], xmm1
+ movq QWORD PTR [rsi+rax], xmm3
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpckhdq xmm4, xmm2 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+ movq QWORD PTR [rsi], xmm1
+
+ psrldq xmm1, 8
+
+ movq QWORD PTR [rsi+rcx*2], xmm4
+ psrldq xmm4, 8
+
+ movq QWORD PTR [rsi+rcx], xmm1
+ movq QWORD PTR [rdi+rcx*2], xmm4
+%endmacro
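Only half of the sixteen transposed output rows fit in registers at once, so the writeback is split: MBV_WRITEBACK_1 stores pixel rows 0-7 and leaves xmm1/xmm3/xmm4/xmm2 holding rows 8-15 for MBV_WRITEBACK_2. Call sites therefore re-base the row pointers between the two halves, as the rewritten y-edge function below does:

    lea rsi, [rsi+rax*8]
    lea rdi, [rdi+rax*8]
    MBV_WRITEBACK_1
    lea rsi, [rsi+rcx*8]
    lea rdi, [rdi+rcx*8]
    MBV_WRITEBACK_2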
+
+
;void vp8_mbloop_filter_vertical_edge_sse2
;(
; unsigned char *src_ptr,
@@ -1039,531 +1569,116 @@
%define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
-
mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
lea rsi, [rsi + rax*4 - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
mov rcx, rax
- neg rcx
+ neg rax
; Transpose
- movq xmm0, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
- movq xmm7, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+ TRANSPOSE_16X8_1
- punpcklbw xmm7, xmm0 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
- movq xmm0, QWORD PTR [rsi+rax] ;
-
- movq xmm5, QWORD PTR [rsi] ;
- punpcklbw xmm5, xmm0 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-
- movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
- movq xmm7, QWORD PTR [rsi + rcx] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
-
- movq xmm0, QWORD PTR [rsi + rcx*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
- punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm4, QWORD PTR [rsi + rcx*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm7, QWORD PTR [rdi + rcx*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-
- punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
- punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
- movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-
- punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
- punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
- movdqa t0, xmm2 ; save to free XMM2
- ;movdqa t1, xmm3
-
- ; XMM3 XMM4 XMM7 in use
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-
- movq xmm6, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
- movq xmm5, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
- punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
- movq xmm6, QWORD PTR [rsi+rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
- movq xmm1, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
- punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
-
- movdqa xmm6, xmm1 ;
- punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
- punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
- movq xmm5, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-
- movq xmm0, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
- punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
- movq xmm2, QWORD PTR [rsi+rcx*4] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm5, QWORD PTR [rdi+rcx*4] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-
- punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
- movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- movdqa xmm0, xmm5
- punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
-
- punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
- punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
-
- movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
- punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
-
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
lea rdx, srct
- punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+ TRANSPOSE_16X8_2 0
- movdqa [rdx+112], xmm7 ; save 7
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- movdqa [rdx+96], xmm6 ; save 6
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa [rdx+32], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+48], xmm3 ; save 3
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+64], xmm4 ; save 4
- movdqa [rdx+80], xmm5 ; save 5
-
- movdqa xmm1, t0 ; get
- movdqa xmm2, xmm1 ;
-
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- movdqa [rdx+16], xmm1
- movdqa [rdx], xmm2
-
- movdqa xmm0, xmm6 ; q2
- psubusb xmm0, xmm7 ; q2-q3
-
- psubusb xmm7, xmm6 ; q3-q2
- por xmm7, xmm0 ; abs (q3-q2)
-
- movdqa xmm1, xmm5 ; q1
- psubusb xmm1, xmm6 ; q1-q2
-
- psubusb xmm6, xmm5 ; q2-q1
- por xmm6, xmm1 ; abs (q2-q1)
-
- ;/*
- ;movdqa xmm0, xmm4 ; q0
- ;psubusb xmm0 xmm5 ; q0-q1
- ;
- ;pusbusb xmm5, xmm4 ; q1-q0
- ;por xmm5, xmm0 ; abs (q1-q0)
- ;*/
-
- movdqa xmm1, [rdx+16] ; p2
- movdqa xmm0, xmm1
-
- psubusb xmm0, xmm2 ; p2 - p3;
- psubusb xmm2, xmm1 ; p3 - p2;
-
- por xmm0, xmm2 ; abs(p2-p3)
-
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm5, xmm2 ; p1
-
- psubusb xmm5, xmm1 ; p1-p2
- psubusb xmm1, xmm2 ; p2-p1
-
- por xmm1, xmm5 ; abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movdqa xmm4, [rdx] ; limit
- psubusb xmm7, xmm4 ;
-
-
- psubusb xmm0, xmm4 ; abs(p3-p2) > limit
- psubusb xmm1, xmm4 ; abs(p2-p1) > limit
-
- psubusb xmm6, xmm4 ; abs(q2-q1) > limit
- por xmm7, xmm6 ; or
-
- por xmm0, xmm1 ;
- por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movdqa xmm1, xmm2 ; p1
-
- movdqa xmm7, xmm3 ; p0
- psubusb xmm7, xmm2 ; p0-p1
-
- psubusb xmm2, xmm3 ; p1-p0
- por xmm2, xmm7 ; abs(p1-p0)
-
- movdqa t0, xmm2 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb xmm2, xmm4 ; abs(p1-p0)>limit
- por xmm0, xmm2 ; mask
-
- movdqa xmm5, [rdx+64] ; q0
- movdqa xmm7, [rdx+80] ; q1
-
- movdqa xmm6, xmm5 ; q0
- movdqa xmm2, xmm7 ; q1
- psubusb xmm5, xmm7 ; q0-q1
-
- psubusb xmm7, xmm6 ; q1-q0
- por xmm7, xmm5 ; abs(q1-q0)
-
- movdqa t1, xmm7 ; save abs(q1-q0)
- psubusb xmm7, xmm4 ; abs(q1-q0)> limit
-
- por xmm0, xmm7 ; mask
-
- movdqa xmm5, xmm2 ; q1
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
- por xmm5, xmm1 ; abs(p1-q1)
- pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ;
- movdqa xmm2, [rdx] ; flimit
-
- movdqa xmm1, xmm3 ; p0
- movdqa xmm7, xmm6 ; q0
- psubusb xmm1, xmm7 ; p0-q0
- psubusb xmm7, xmm3 ; q0-p0
- por xmm1, xmm7 ; abs(q0-p0)
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm0; ; mask
- pxor xmm0, xmm0
- pcmpeqb xmm1, xmm0
-
+ ; calculate filter mask
+ LFV_FILTER_MASK 0
; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, [rdx]
-
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
-
- por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm0
-
- pcmpeqb xmm0, xmm0
- pxor xmm4, xmm0
-
+ LFV_HEV_MASK
; start work on filters
- lea rdx, srct
-
- ; start work on filters
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm7, [rdx+80] ; q1
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- movdqa xmm6, [rdx+48] ; p0
-
- movdqa xmm0, [rdx+64] ; q0
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
-
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
-
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
- paddsb xmm2, xmm0 ; 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1)
-
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- ; xmm1 = vp8_filter, xmm4=hev, xmm6=ps0, xmm3=qs0
- movdqa xmm2, xmm1 ; vp8_filter
- pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
-
- movdqa xmm5, xmm2
- paddsb xmm5, [t3 GLOBAL]
-
- pxor xmm0, xmm0 ; 0
- pxor xmm7, xmm7 ; 0
-
- punpcklbw xmm0, xmm5 ; e0f0g0h0
- psraw xmm0, 11 ; sign extended shift right by 3
-
- punpckhbw xmm7, xmm5 ; a0b0c0d0
- psraw xmm7, 11 ; sign extended shift right by 3
-
- packsswb xmm0, xmm7 ; Filter2 >>=3;
- movdqa xmm5, xmm0 ; Filter2
-
- paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor xmm0, xmm0 ; 0
-
- pxor xmm7, xmm7 ; 0
- punpcklbw xmm0, xmm2 ; e0f0g0h0
-
- psraw xmm0, 11 ; sign extended shift right by 3
- punpckhbw xmm7, xmm2 ; a0b0c0d0
-
- psraw xmm7, 11 ; sign extended shift right by 3
- packsswb xmm0, xmm7 ; Filter2 >>=3;
-
- ; xmm0= filter2 xmm1 = vp8_filter, xmm3 =qs0 xmm5=s xmm4 =hev xmm6=ps0
- psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
- paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
-
-
- ; xmm1=vp8_filter, xmm3=qs0, xmm4 =hev xmm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
- pandn xmm4, xmm1 ; vp8_filter&=~hev
-
- ; xmm3=qs0, xmm4=filter2, xmm6=ps0
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
- pxor xmm0, xmm0
- pxor xmm1, xmm1
-
- pxor xmm2, xmm2
- punpcklbw xmm1, xmm4
-
- punpckhbw xmm2, xmm4
- pmulhw xmm1, [s27 GLOBAL]
-
- pmulhw xmm2, [s27 GLOBAL]
- paddw xmm1, [s63 GLOBAL]
-
- paddw xmm2, [s63 GLOBAL]
- psraw xmm1, 7
-
- psraw xmm2, 7
- packsswb xmm1, xmm2
-
- psubsb xmm3, xmm1
- paddsb xmm6, xmm1
-
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
-
- movdqa [rdx+48], xmm6
- movdqa [rdx+64], xmm3
-
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
-
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
-
- pmulhw xmm1, [s18 GLOBAL]
- pmulhw xmm2, [s18 GLOBAL]
-
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packsswb xmm1, xmm2
-
- movdqa xmm3, [rdx + 80] ;/q1
- movdqa xmm6, [rdx + 32] ; p1
-
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
-
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
-
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-
- movdqa [rdx + 80], xmm3
- movdqa [rdx + 32], xmm6
-
-
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
-
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
-
- pmulhw xmm1, [s9 GLOBAL]
- pmulhw xmm2, [s9 GLOBAL]
-
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packsswb xmm1, xmm2
-
- movdqa xmm6, [rdx+16]
- movdqa xmm3, [rdx+96]
-
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
-
- pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 15 06
-
+ MBV_FILTER
; transpose and write back
- movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ MBV_TRANSPOSE
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_1
- movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
+ MBV_WRITEBACK_2
- punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+ add rsp, 160
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
- movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
- punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ ALIGN_STACK 16, rax
+ sub rsp, 160 ; reserve 160 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ mov rsi, arg(0) ;u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
- movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
- punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ mov rcx, rax
+ neg rax
- movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+ ; Transpose
+ TRANSPOSE_16X8_1
- punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
- movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ ; XMM3 XMM4 XMM7 in use
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ lea rdx, srct
+ TRANSPOSE_16X8_2 0
- punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
- punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+ ; calculate filter mask
+ LFV_FILTER_MASK 0
+ ; calculate high edge variance
+ LFV_HEV_MASK
- lea rsi, [rsi+rcx*8]
- lea rdi, [rdi+rcx*8]
+ ; start work on filters
+ MBV_FILTER
- movq QWORD PTR [rsi+rcx*4], xmm0
- psrldq xmm0, 8
+ ; transpose and write back
+ MBV_TRANSPOSE
- movq QWORD PTR [rsi+rcx*2], xmm6
- psrldq xmm6, 8
-
- movq QWORD PTR [rdi+rcx*4], xmm0
- movq QWORD PTR [rsi+rcx], xmm6
-
- movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
-
- punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-
- movq QWORD PTR [rsi], xmm0
- psrldq xmm0, 8
-
- movq QWORD PTR [rsi+rax*2], xmm5
- psrldq xmm5, 8
-
- movq QWORD PTR [rsi+rax], xmm0
- movq QWORD PTR [rdi+rax*2], xmm5
-
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
- movdqa xmm0, xmm2
-
- punpcklwd xmm0, xmm3 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
- punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
-
- movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
-
- punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
-
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-
- movq QWORD PTR [rsi+rcx*4], xmm1
- psrldq xmm1, 8
-
- movq QWORD PTR [rsi+rcx*2], xmm3
- psrldq xmm3, 8
-
- movq QWORD PTR [rdi+rcx*4], xmm1
- movq QWORD PTR [rsi+rcx], xmm3
-
- movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
-
- punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
- movq QWORD PTR [rsi], xmm1
-
- psrldq xmm1, 8
-
- movq QWORD PTR [rsi+rax*2], xmm4
- psrldq xmm4, 8
-
- movq QWORD PTR [rsi+rax], xmm1
- movq QWORD PTR [rdi+rax*2], xmm4
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ MBV_WRITEBACK_1
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ MBV_WRITEBACK_2
add rsp, 160
pop rsp
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 3a9437e..16498ab 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -34,6 +34,11 @@
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
#if HAVE_MMX
// Horizontal MB filtering
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -157,10 +162,7 @@
vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+ vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
}
@@ -183,10 +185,7 @@
vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+ vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
}
@@ -211,10 +210,7 @@
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
- vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
-
- if (v_ptr)
- vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+ vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
}
@@ -241,10 +237,7 @@
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
- vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
-
- if (v_ptr)
- vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+ vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
}
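The four new *_uv_sse2 entry points take the V-plane pointer as a sixth argument (the loop_filter_uvfunction shape declared above), so a single SSE2 pass now filters both chroma planes where two MMX calls were needed before. For vertical edges the asm stacks eight U rows and eight V rows into one 16-row tile via TRANSPOSE_16X8_1/2; a hedged scalar model of that packing (hypothetical helper, not the real data path):

    #include <string.h>

    /* Sketch: gather p3..q3 around the vertical edge for 8 rows of U
       and 8 rows of V into one 16x8 tile, so a single 16-wide filter
       pass covers both planes. */
    static void stack_uv_tile(unsigned char tile[16][8],
                              const unsigned char *u, const unsigned char *v,
                              int stride)
    {
        for (int r = 0; r < 8; r++) {
            memcpy(tile[r],     u + r * stride - 4, 8);
            memcpy(tile[8 + r], v + r * stride - 4, 8);
        }
    }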
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 3a237de..60ca74a 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -293,17 +293,16 @@
// Apply the loop filter if appropriate.
if (cm->filter_level > 0)
- {
vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
- cm->last_frame_type = cm->frame_type;
- cm->last_filter_type = cm->filter_type;
- cm->last_sharpness_level = cm->sharpness_level;
-
- }
vpx_usec_timer_mark(&lpftimer);
pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
}
+ if (cm->filter_level > 0) {
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 38d6042..18c8da0 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -281,11 +281,11 @@
YV12_BUFFER_CONFIG *post = &cm->new_frame;
loop_filter_info *lfi = cm->lf_info;
+ int frame_type = cm->frame_type;
int mb_row;
int mb_col;
-
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
@@ -319,7 +319,10 @@
}
// Initialize the loop filter for this frame.
- vp8_init_loop_filter(cm);
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter(lfi, frame_type);
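This pairs with the onyxd_if.c hunk above: the last_* fields snapshotted after each filtered frame are exactly what this check reads, so a full vp8_init_loop_filter() runs only when the filter type or sharpness actually changed, and the cheaper per-frame variant only when the key/inter frame type flipped. Factored as a helper, the decision looks like this (a sketch using only names visible in this diff; the real call sites inline it):

    static void maybe_reinit_loop_filter(VP8_COMMON *cm, loop_filter_info *lfi,
                                         int frame_type)
    {
        if ((cm->last_filter_type != cm->filter_type) ||
            (cm->last_sharpness_level != cm->sharpness_level))
            vp8_init_loop_filter(cm);                    /* full rebuild         */
        else if (frame_type != cm->last_frame_type)
            vp8_frame_init_loop_filter(lfi, frame_type); /* frame-type part only */
        /* else: reuse the cached tables untouched */
    }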
// Set up the buffer pointers
y_ptr = post->y_buffer;
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index c1fcfe2..b55bc51 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -100,14 +100,9 @@
void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
- void (*short_fdct4x4rd)(short *input, short *output, int pitch);
- void (*short_fdct8x4rd)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
-
void (*quantize_b)(BLOCK *b, BLOCKD *d);
-
-
} MACROBLOCK;
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 3075e58..58e3610 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -11,163 +11,54 @@
#include <math.h>
-
-static const short dct_matrix2[4][4] =
-{
- { 23170, 30274, 23170, 12540 },
- { 23170, 12540, -23170, -30274 },
- { 23170, -12540, -23170, 30274 },
- { 23170, -30274, 23170, -12540 }
-};
-
-static const short dct_matrix1[4][4] =
-{
- { 23170, 23170, 23170, 23170 },
- { 30274, 12540, -12540, -30274 },
- { 23170, -23170, -23170, 23170 },
- { 12540, -30274, 30274, -12540 }
-};
-
-
-#define _1STSTAGESHIFT 14
-#define _1STSTAGEROUNDING (1<<( _1STSTAGESHIFT-1))
-#define _2NDSTAGESHIFT 16
-#define _2NDSTAGEROUNDING (1<<( _2NDSTAGESHIFT-1))
-
-// using matrix multiply
void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
{
- int i, j, k;
- short temp[4][4];
- int sumtemp;
- pitch >>= 1;
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- sumtemp = 0;
-
- for (k = 0; k < 4; k++)
- {
- sumtemp += input[i*pitch+k] * dct_matrix2[k][j];
-
- }
-
- temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT);
- }
- }
-
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- sumtemp = 0;
-
- for (k = 0; k < 4; k++)
- {
- sumtemp += dct_matrix1[i][ k] * temp[k][ j];
- }
-
- output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT);
- }
- }
-
-}
-
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-{
- vp8_short_fdct4x4_c(input, output, pitch);
- vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-
-static const signed short x_c1 = 60547;
-static const signed short x_c2 = 46341;
-static const signed short x_c3 = 25080;
-
-void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
-{
int i;
int a1, b1, c1, d1;
- int a2, b2, c2, d2;
short *ip = input;
-
short *op = output;
- int temp1, temp2;
for (i = 0; i < 4; i++)
{
- a1 = (ip[0] + ip[3]) * 2;
- b1 = (ip[1] + ip[2]) * 2;
- c1 = (ip[1] - ip[2]) * 2;
- d1 = (ip[0] - ip[3]) * 2;
+ a1 = ((ip[0] + ip[3])<<3);
+ b1 = ((ip[1] + ip[2])<<3);
+ c1 = ((ip[1] - ip[2])<<3);
+ d1 = ((ip[0] - ip[3])<<3);
- temp1 = a1 + b1;
- temp2 = a1 - b1;
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
- op[0] = ((temp1 * x_c2) >> 16) + temp1;
- op[2] = ((temp2 * x_c2) >> 16) + temp2;
-
- temp1 = (c1 * x_c3) >> 16;
- temp2 = ((d1 * x_c1) >> 16) + d1;
-
- op[1] = temp1 + temp2;
-
- temp1 = (d1 * x_c3) >> 16;
- temp2 = ((c1 * x_c1) >> 16) + c1;
-
- op[3] = temp1 - temp2;
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12;
ip += pitch / 2;
op += 4;
- }
+ }
ip = output;
op = output;
-
for (i = 0; i < 4; i++)
{
-
a1 = ip[0] + ip[12];
b1 = ip[4] + ip[8];
c1 = ip[4] - ip[8];
d1 = ip[0] - ip[12];
+ op[0] = ( a1 + b1 + 7)>>4;
+ op[8] = ( a1 - b1 + 7)>>4;
- temp1 = a1 + b1;
- temp2 = a1 - b1;
-
- a2 = ((temp1 * x_c2) >> 16) + temp1;
- c2 = ((temp2 * x_c2) >> 16) + temp2;
-
- temp1 = (c1 * x_c3) >> 16;
- temp2 = ((d1 * x_c1) >> 16) + d1;
-
- b2 = temp1 + temp2;
-
- temp1 = (d1 * x_c3) >> 16;
- temp2 = ((c1 * x_c1) >> 16) + c1;
-
- d2 = temp1 - temp2;
-
-
- op[0] = (a2 + 1) >> 1;
- op[4] = (b2 + 1) >> 1;
- op[8] = (c2 + 1) >> 1;
- op[12] = (d2 + 1) >> 1;
+ op[4] =((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16;
ip++;
op++;
}
}
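The magic numbers are a Q12 rotation: 5352 and 2217 appear to be 4096*sqrt(2)*cos(pi/8) and 4096*sqrt(2)*sin(pi/8), with 14500/7500 and 12000/51000 as the per-stage rounding biases. A worked row-pass example with illustrative inputs (not from the source):

    /* ip[0..3] = {1, 2, 3, 4} */
    int a1 = (1 + 4) << 3;                               /*  40 */
    int b1 = (2 + 3) << 3;                               /*  40 */
    int c1 = (2 - 3) << 3;                               /*  -8 */
    int d1 = (1 - 4) << 3;                               /* -24 */
    int op0 = a1 + b1;                                   /*  80 */
    int op2 = a1 - b1;                                   /*   0 */
    int op1 = (c1 * 2217 + d1 * 5352 + 14500) >> 12;     /* -33 */
    int op3 = (d1 * 2217 - c1 * 5352 + 7500) >> 12;      /*  -1 */

The column pass then divides the <<3 prescale back out with (a1 + b1 + 7) >> 4 and the Q16 form of the same rotation; the + (d1 != 0) term looks like a bias keeping op[4] nonzero whenever d1 is, though the patch itself does not say so.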
-void vp8_fast_fdct8x4_c(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
{
- vp8_fast_fdct4x4_c(input, output, pitch);
- vp8_fast_fdct4x4_c(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index f79dba4..0ab40b3 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -32,16 +32,6 @@
#endif
extern prototype_fdct(vp8_fdct_short8x4);
-#ifndef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c
-#endif
-extern prototype_fdct(vp8_fdct_fast4x4);
-
-#ifndef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c
-#endif
-extern prototype_fdct(vp8_fdct_fast8x4);
-
#ifndef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c
#endif
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 0e16093..870cb58 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -66,7 +66,7 @@
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
- x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 824850c..8bc01df 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -130,7 +130,8 @@
for (i = 16; i < 24; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
}
@@ -140,14 +141,16 @@
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
@@ -157,14 +160,16 @@
for (i = 0; i < 16; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
void vp8_transform_mb(MACROBLOCK *x)
@@ -173,7 +178,8 @@
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
@@ -182,12 +188,14 @@
for (i = 16; i < 24; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
@@ -197,14 +205,16 @@
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
if (x->e_mbd.mbmi.mode != SPLITMV)
{
vp8_build_dcblock(x);
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
}
@@ -214,7 +224,8 @@
for (i = 0; i < 16; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
@@ -223,12 +234,14 @@
for (i = 16; i < 24; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
void vp8_stuff_inter16x16(MACROBLOCK *x)
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index a205667..dd98a09 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -257,9 +257,6 @@
z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
- z->short_fdct4x4rd = x->short_fdct4x4rd;
- z->short_fdct8x4rd = x->short_fdct8x4rd;
- z->short_fdct8x4rd = x->short_fdct8x4rd;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index e68d650..dd89f1a 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -68,8 +68,8 @@
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_c;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 4c3edd7..156578b 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -997,7 +997,7 @@
int tot_steps;
MV this_mv;
- unsigned int bestsad = UINT_MAX;
+ int bestsad = INT_MAX;
int best_site = 0;
int last_site = 0;
@@ -1238,7 +1238,7 @@
unsigned char *bestaddress;
MV *best_mv = &d->bmi.mv.as_mv;
MV this_mv;
- unsigned int bestsad = UINT_MAX;
+ int bestsad = INT_MAX;
int r, c;
unsigned char *check_here;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f3456a7..f331a4b 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -137,8 +137,6 @@
extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
@@ -1136,15 +1134,11 @@
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
- cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
- cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
}
else
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
- cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
- cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
}
cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
@@ -3237,7 +3231,7 @@
unsigned char block_size
)
{
- int byte = 0; // Buffer offset for the current pixel value being filtered
+ int byte = 0; // Buffer offset for current pixel being filtered
int frame = 0;
int modifier = 0;
int i, j, k;
@@ -3270,9 +3264,9 @@
for (frame = 0; frame < frame_count; frame++)
{
// get current frame pixel value
- int pixel_value = frames[frame][byte]; // int pixel_value = *frameptr;
+ int pixel_value = frames[frame][byte];
- modifier = src_byte; // modifier = s[byte];
+ modifier = src_byte;
modifier -= pixel_value;
modifier *= modifier;
modifier >>= strength;
@@ -3289,10 +3283,10 @@
}
accumulator += (count >> 1);
- accumulator *= fixed_divide[count]; // accumulator *= ppi->fixed_divide[count];
+ accumulator *= fixed_divide[count];
accumulator >>= 16;
- dst[byte] = accumulator; // d[byte] = accumulator;
+ dst[byte] = accumulator;
// move to next pixel
byte++;
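Read together, the cleaned-up lines implement a weighted running average in fixed point: fixed_divide[count] behaves as a Q16 reciprocal table, so the multiply-and-shift is a rounded divide by the total weight. A hedged sketch of the whole per-pixel blend; the clamp to a 16-step weight is assumed, since only the squared-difference test and the rounded divide are visible in this hunk:

    /* Sketch, not the shipped code. Assumes the source frame is among
       'samples' so count never reaches fixed_divide[0]. */
    static unsigned char temporal_blend(const unsigned char *samples, int n,
                                        unsigned char src, int strength,
                                        const unsigned int *fixed_divide)
    {
        unsigned int accumulator = 0, count = 0;
        for (int i = 0; i < n; i++) {
            int modifier = (int)src - samples[i];
            modifier = (modifier * modifier) >> strength; /* big diff, low weight */
            if (modifier > 16) modifier = 16;             /* assumed cap          */
            modifier = 16 - modifier;
            accumulator += (unsigned int)modifier * samples[i];
            count += modifier;
        }
        accumulator += count >> 1;              /* round to nearest       */
        accumulator *= fixed_divide[count];     /* Q16: (1 << 16) / count */
        return (unsigned char)(accumulator >> 16);
    }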
@@ -3398,7 +3392,8 @@
{
if ((frames_to_blur_backward + frames_to_blur_forward) >= max_frames)
{
- frames_to_blur_backward = max_frames - frames_to_blur_forward - 1;
+ frames_to_blur_backward
+ = max_frames - frames_to_blur_forward - 1;
}
}
else
@@ -3455,7 +3450,7 @@
for (frame = 0; frame < frames_to_blur; frame++)
{
- int which_buffer = cpi->last_alt_ref_sei - frame;
+ int which_buffer = start_frame - frame;
if (which_buffer < 0)
which_buffer += cpi->oxcf.lag_in_frames;
@@ -3479,7 +3474,7 @@
for (frame = 0; frame < frames_to_blur; frame++)
{
- int which_buffer = cpi->last_alt_ref_sei - frame;
+ int which_buffer = start_frame - frame;
if (which_buffer < 0)
which_buffer += cpi->oxcf.lag_in_frames;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 2d6dee1..70cf122 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1028,7 +1028,7 @@
vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict);
ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16);
- x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
// set to 0 no way to account for 2nd order DC so discount
//be->coeff[0] = 0;
@@ -1056,7 +1056,7 @@
// Fdct and building the 2nd order block
for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
{
- mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32);
+ mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
*Y2DCPtr++ = beptr->coeff[0];
*Y2DCPtr++ = beptr->coeff[16];
}
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
index 6aeac50..bf12fee 100644
--- a/vp8/encoder/x86/csystemdependent.c
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -181,10 +181,17 @@
// Willamette instruction set available:
vp8_mbuverror = vp8_mbuverror_xmm;
vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+#if 0 //new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_wmt;
+#else
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
@@ -218,10 +225,17 @@
// MMX instruction set available:
vp8_mbuverror = vp8_mbuverror_mmx;
vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+#if 0 // new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_mmx;
+#else
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
@@ -254,10 +268,10 @@
{
// Pure C:
vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
vp8_subtract_b = vp8_subtract_b_c;
vp8_subtract_mbuv = vp8_subtract_mbuv_c;
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index 32d6610..ff96c49 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -13,8 +13,7 @@
section .text
global sym(vp8_short_fdct4x4_mmx)
- global sym(vp8_fast_fdct4x4_mmx)
- global sym(vp8_fast_fdct8x4_wmt)
+ global sym(vp8_short_fdct8x4_wmt)
%define DCTCONSTANTSBITS (16)
@@ -24,10 +23,6 @@
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-; using matrix multiply with source and destbuffer has a pitch
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
push rbp
@@ -37,333 +32,6 @@
push rsi
push rdi
; end prolog
-
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- movsxd rax, dword ptr arg(2) ;pitch
- lea rdx, [dct_matrix GLOBAL]
-
- movq mm0, [rsi ]
- movq mm1, [rsi + rax]
-
- movq mm2, [rsi + rax*2]
- lea rsi, [rsi + rax*2]
-
- movq mm3, [rsi + rax]
-
- ; first column
- movq mm4, mm0
- movq mm7, [rdx]
-
- pmaddwd mm4, mm7
- movq mm5, mm1
-
- pmaddwd mm5, mm7
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
-
- pmaddwd mm5, mm7
- movq mm6, mm3
-
- pmaddwd mm6, mm7
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column (this is the last column, so we do not have save the source any more)
-
- pmaddwd mm0, [rdx+24]
-
- pmaddwd mm1, [rdx+24]
- movq mm6, mm0
-
- punpckldq mm0, mm1
- punpckhdq mm6, mm1
-
- paddd mm0, mm6
-
- pmaddwd mm2, [rdx+24]
-
- pmaddwd mm3, [rdx+24]
- movq mm7, mm2
-
- punpckldq mm2, mm3
- punpckhdq mm7, mm3
-
- paddd mm2, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm0, mm6
- paddd mm2, mm6
-
- psrad mm0, _1STSTAGESHIFT
- psrad mm2, _1STSTAGESHIFT
-
- packssdw mm0, mm2
-
- movq mm3, mm0
-
- ; done with one pass
- ; now start second pass
- movq mm0, [rdi ]
- movq mm1, [rdi+ 8]
- movq mm2, [rdi+ 16]
-
- movq mm4, mm0
-
- pmaddwd mm4, [rdx]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+24]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+24]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+24], mm4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
-sym(vp8_fast_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
@@ -379,11 +47,11 @@
movq mm3, [rcx + rax]
; get the constants
 ;shift left for precision
- paddw mm0, mm0
- paddw mm1, mm1
+ psllw mm0, 3
+ psllw mm1, 3
- psllw mm2, 1
- psllw mm3, 1
+ psllw mm2, 3
+ psllw mm3, 3
; transpose for the second stage
movq mm4, mm0 ; 00 01 02 03
@@ -531,20 +199,23 @@
movq mm3, mm5
; done with vertical
- pcmpeqw mm4, mm4
- pcmpeqw mm5, mm5
- psrlw mm4, 15
- psrlw mm5, 15
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ psllw mm4, 2
+ psllw mm5, 2
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm4
paddw mm3, mm5
- psraw mm0, 1
- psraw mm1, 1
- psraw mm2, 1
- psraw mm3, 1
+ psraw mm0, 3
+ psraw mm1, 3
+ psraw mm2, 3
+ psraw mm3, 3
movq [rdi ], mm0
movq [rdi+ 8], mm1
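The pcmpeqw/psrlw 15 pair builds the per-word constant 1 and the added psllw 2 turns it into 4, so the final-stage rounding moves from (x + 1) >> 1 to (x + 4) >> 3, balancing the <<3 prescale now applied on load. In C terms (names illustrative):

    int old_out = (sum + 1) >> 1;   /* previous rounding         */
    int new_out = (sum + 4) >> 3;   /* matches the psllw 3 loads */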
@@ -560,8 +231,8 @@
ret
-;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_fast_fdct8x4_wmt):
+;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_short_fdct8x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
@@ -584,11 +255,11 @@
movdqa xmm3, [rcx + rax]
; get the constants
 ;shift left for precision
- psllw xmm0, 1
- psllw xmm2, 1
+ psllw xmm0, 3
+ psllw xmm2, 3
- psllw xmm4, 1
- psllw xmm3, 1
+ psllw xmm4, 3
+ psllw xmm3, 3
; transpose for the second stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
@@ -758,20 +429,23 @@
; done with vertical
- pcmpeqw xmm4, xmm4
- pcmpeqw xmm5, xmm5;
- psrlw xmm4, 15
- psrlw xmm5, 15
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ psllw xmm4, 2
+ psllw xmm5, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm4
paddw xmm3, xmm5
- psraw xmm0, 1
- psraw xmm1, 1
- psraw xmm2, 1
- psraw xmm3, 1
+ psraw xmm0, 3
+ psraw xmm1, 3
+ psraw xmm2, 3
+ psraw xmm3, 3
movq QWORD PTR[rdi ], xmm0
movq QWORD PTR[rdi+ 8], xmm1
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 1cd137d..0e8cfcf 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,251 +11,179 @@
%include "vpx_ports/x86_abi_support.asm"
-global sym(vp8_short_fdct4x4_wmt)
-
-%define DCTCONSTANTSBITS (16)
-%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
-%define x_c1 (60547) ; cos(pi /8) * (1<<15)
-%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
-%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-
-;; using matrix multiply
-;void vp8_short_fdct4x4_wmt(short *input, short *output)
-sym(vp8_short_fdct4x4_wmt):
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
+ SHADOW_ARGS_TO_STACK 3
+;; SAVE_XMM
GET_GOT rbx
+ push rsi
+ push rdi
; end prolog
- mov rax, arg(0) ;input
- mov rcx, arg(1) ;output
+ mov rsi, arg(0)
+ movsxd rax, DWORD PTR arg(2)
+ lea rdi, [rsi + rax*2]
- lea rdx, [dct_matrix_sse2 GLOBAL]
+ movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
+ movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
+ movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
- movdqu xmm0, [rax ]
- movdqu xmm1, [rax+16]
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
- ; first column
- movdqa xmm2, xmm0
- movdqa xmm7, [rdx]
+ mov rdi, arg(1)
- pmaddwd xmm2, xmm7
- movdqa xmm3, xmm1
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
- pmaddwd xmm3, xmm7
- movdqa xmm4, xmm2
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
+ paddd xmm3, XMMWORD PTR[_14500 GLOBAL]
+ paddd xmm4, XMMWORD PTR[_7500 GLOBAL]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm4
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
- punpckhdq xmm3, xmm4
- paddd xmm2, xmm3
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+ movdqa xmm5, XMMWORD PTR[_7 GLOBAL]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
- paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
- psrad xmm2, _1STSTAGESHIFT
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[_cmp_mask GLOBAL] ;clear upper,
+ ;and keep bit 0 of lower
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[_12000 GLOBAL]
+ paddd xmm4, XMMWORD PTR[_51000 GLOBAL]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
+ movdqa XMMWORD PTR[rdi + 0], xmm0
+ movdqa XMMWORD PTR[rdi + 16], xmm1
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm3, _1STSTAGESHIFT
- packssdw xmm2, xmm3
-
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _1STSTAGESHIFT
-
- ;fourth column (this is the last column, so we do not have save the source any more)
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm0, _1STSTAGESHIFT
- packssdw xmm3, xmm0
- ; done with one pass
- ; now start second pass
- movdqa xmm0, xmm2
- movdqa xmm1, xmm3
-
- pmaddwd xmm2, xmm7
- pmaddwd xmm3, xmm7
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
-
- punpckhdq xmm4, xmm3
- movdqa xmm3, xmm2
-
- punpckldq xmm2, xmm4
- punpckhdq xmm3, xmm4
-
- paddd xmm2, xmm3
- paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm2, _2NDSTAGESHIFT
-
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- packssdw xmm2, xmm3
-
- movdqu [rcx], xmm2
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- ;fourth column
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm0, _2NDSTAGESHIFT
- packssdw xmm3, xmm0
-
- movdqu [rcx+16], xmm3
-
- mov rsp, rbp
; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
+;; RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
SECTION_RODATA
-;static unsigned int dct1st_stage_rounding_sse2[4] =
align 16
-dct1st_stage_rounding_sse2:
- times 4 dd 8192
-
-
-;static unsigned int dct2nd_stage_rounding_sse2[4] =
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
align 16
-dct2nd_stage_rounding_sse2:
- times 4 dd 32768
-
-;static short dct_matrix_sse2[4][8]=
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
align 16
-dct_matrix_sse2:
- times 8 dw 23170
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
- dw 30274
- dw 12540
- dw -12540
- dw -30274
- dw 30274
- dw 12540
- dw -12540
- dw -30274
-
- dw 23170
- times 2 dw -23170
- times 2 dw 23170
- times 2 dw -23170
- dw 23170
-
- dw 12540
- dw -30274
- dw 30274
- dw -12540
- dw 12540
- dw -30274
- dw 30274
- dw -12540
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
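These tables map straight onto the C formulae once you see how pmaddwd pairs the words: each 32-bit lane holds a (d1, c1) or (a1, b1) word pair, so one multiply-accumulate evaluates a whole rotation term. Scalar equivalents per lane (a sketch; d1/c1/a1/b1 as in the register comments above):

    int rot_a = d1 * 5352 + c1 * 2217;   /* pmaddwd with _5352_2217    */
    int rot_b = d1 * 2217 - c1 * 5352;   /* pmaddwd with _2217_neg5352 */
    int sum   = a1 + b1;                 /* pmaddwd with _mult_add     */
    int diff  = a1 - b1;                 /* pmaddwd with _mult_sub     */

_14500/_7500 then bias the row pass before its >>12, _12000/_51000 bias the column pass before its >>16, _7 feeds the (a1 + b1 + 7) >> 4 rounding, and _cmp_mask isolates the (d1 != 0) correction added to op[4].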
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index 05d0180..bff52e1 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -22,46 +22,41 @@
#if HAVE_MMX
extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_mmx);
-extern prototype_fdct(vp8_fast_fdct4x4_mmx);
-extern prototype_fdct(vp8_fast_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
+#if 0
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-
-#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
-
-#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+#endif
#endif
#endif
#if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct4x4_wmt);
extern prototype_fdct(vp8_short_fdct8x4_wmt);
-extern prototype_fdct(vp8_fast_fdct8x4_wmt);
-
extern prototype_fdct(vp8_short_walsh4x4_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-#if 0
+extern prototype_fdct(vp8_short_fdct4x4_sse2);
+#if !CONFIG_RUNTIME_CPU_DETECT
+#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
#undef vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
#endif
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
+
#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
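On the consumer side: with CONFIG_RUNTIME_CPU_DETECT disabled, call sites bind at compile time to whatever the #defines above name; with it enabled, they go through the rtcd function table that x86_csystemdependent.c (below) fills in. A condensed sketch of that dispatch pattern with stand-in types; the real FDCT_INVOKE macro is assumed to live in vp8/encoder/dct.h:

    /* stand-ins for the rtcd table entry types */
    typedef void (*fdct_fn_t)(short *input, short *output, int pitch);
    typedef struct
    {
        fdct_fn_t short4x4, short8x4, fast4x4, fast8x4;
    } fdct_rtcd_t;

    #if CONFIG_RUNTIME_CPU_DETECT
    #define FDCT_INVOKE(ctx, fn) ((ctx)->fn)    /* indirect, table-driven */
    #else
    #define FDCT_INVOKE(ctx, fn) vp8_fdct_##fn  /* direct, via the #defines */
    #endif

    /* call-site shape:
     *   FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4)(src_diff, coeff, 32);
     */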
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index f375045..4d05156 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,15 +18,10 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_mmx(input, output, pitch);
- vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
-void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
-{
- vp8_fast_fdct4x4_mmx(input, output , pitch);
- vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
@@ -87,10 +82,10 @@
#endif
#if HAVE_SSE2
-void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_wmt(input, output, pitch);
- vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_sse2(input, output, pitch);
+ vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
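The wrapper above is just two 4x4 transforms over the halves of an 8-wide block: input + 4 steps four columns (shorts) to the right, output + 16 skips the 16 coefficients the first 4x4 emitted, and pitch, given in bytes, strides the rows identically in both calls. A toy illustration of the offsets:

    /* an 8x4 block of shorts with a row stride of 8 shorts = 16 bytes */
    short block[4 * 8];
    short coeff[32];

    vp8_short_fdct4x4_c(block,     coeff,      16); /* cols 0..3 -> coeff[0..15]  */
    vp8_short_fdct4x4_c(block + 4, coeff + 16, 16); /* cols 4..7 -> coeff[16..31] */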
int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
@@ -221,11 +216,19 @@
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
+#if 0 // new fdct
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
+#else
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
+
+#endif
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
@@ -270,13 +273,11 @@
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
-#if 0
- /* short SSE2 DCT currently disabled, does not match the MMX version */
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
-#endif
- /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
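These assignments run in the encoder's x86-specific init and are gated on CPUID feature bits, so SSE2 machines pick up the new transform while everything else keeps the C baseline set earlier in the function. A rough condensation of the selection flow; x86_simd_caps() and HAS_SSE2 are from vpx_ports/x86.h, VP8_COMP is the encoder context from vp8/encoder/onyx_int.h, and the wrapper name here is ours:

    #include "vpx_ports/x86.h"

    static void pick_fdct(VP8_COMP *cpi)
    {
        int flags = x86_simd_caps();

        /* C versions are the baseline... */
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;

        /* ...overridden only when the CPU advertises SSE2 */
        if (flags & HAS_SSE2)
        {
            cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
            cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
        }
    }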
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f09f258..c88df47 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -93,10 +93,10 @@
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm