Merge "Remove INLINE/FORCEINLINE"
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index bb4af22..12e56ab 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -14,16 +14,6 @@
#include "loopfilter.h"
#include "onyxc_int.h"
-typedef void loop_filter_uvfunction
-(
- unsigned char *u, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh,
- unsigned char *v
-);
-
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index f051a31..66185d1 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -117,5 +117,14 @@
#define LF_INVOKE(ctx,fn) vp8_lf_##fn
#endif
+typedef void loop_filter_uvfunction
+(
+ unsigned char *u, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ unsigned char *v
+);
#endif
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 4d5d987..33a5433 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -201,6 +201,7 @@
void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level);
void vp8_init_loop_filter(VP8_COMMON *cm);
+void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
#endif
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index f11fcad..ad2f36c 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -12,6 +12,283 @@
%include "vpx_ports/x86_abi_support.asm"
+%macro LFH_FILTER_MASK 1
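+ ; Build the filter mask in xmm1: 0xFF bytes mark columns where every
+ ; abs-difference breakout test passes.
+ ; %1==1: Y plane - each row is a single aligned 16-byte load.
+ ; %1==0: u/v planes - each row is built from an 8-byte u half (rsi) and an
+ ;        8-byte v half (rdi); q2/q1/p2/p1 are also spilled to stack slots.
+ ; Exits with xmm0 = q0, xmm6 = p0, xmm5 = 0, t0 = abs(q0-q1), t1 = abs(p1-p0).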
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+%else
+ movq xmm0, [rsi + rcx*2] ; q3
+ movq xmm2, [rdi + rcx*2]
+ pslldq xmm2, 8
+ por xmm2, xmm0
+ movq xmm1, [rsi + rcx] ; q2
+ movq xmm3, [rdi + rcx]
+ pslldq xmm3, 8
+ por xmm1, xmm3
+ movdqa XMMWORD PTR [rsp], xmm1 ; store q2
+%endif
+
+ movdqa xmm6, xmm1 ; q2
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ psubusb xmm1, xmm7
+
+%if %1
+ movdqa xmm4, [rsi+rax] ; q1
+%else
+ movq xmm0, [rsi] ; q1
+ movq xmm4, [rdi]
+ pslldq xmm4, 8
+ por xmm4, xmm0
+ movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
+%endif
+
+ movdqa xmm3, xmm4 ; q1
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+ por xmm4, xmm6 ; abs(q2-q1)
+ psubusb xmm4, xmm7
+
+ por xmm1, xmm4
+
+%if %1
+ movdqa xmm4, [rsi] ; q0
+%else
+ movq xmm4, [rsi + rax] ; q0
+ movq xmm0, [rdi + rax]
+ pslldq xmm0, 8
+ por xmm4, xmm0
+%endif
+
+ movdqa xmm0, xmm4 ; q0
+ psubusb xmm4, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+ por xmm4, xmm3 ; abs(q0-q1)
+ movdqa t0, xmm4 ; save to t0
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+%if %1
+ neg rax ; negate pitch to deal with above border
+
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+%else
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movq xmm2, [rsi + rax] ; p3
+ movq xmm3, [rdi + rax]
+ pslldq xmm3, 8
+ por xmm2, xmm3
+ movq xmm4, [rsi] ; p2
+ movq xmm5, [rdi]
+ pslldq xmm5, 8
+ por xmm4, xmm5
+ movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+ por xmm4, xmm2 ; abs(p3 - p2)
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+%if %1
+ movdqa xmm4, [rsi+2*rax] ; p1
+%else
+ movq xmm4, [rsi + rcx] ; p1
+ movq xmm3, [rdi + rcx]
+ pslldq xmm3, 8
+ por xmm4, xmm3
+ movdqa XMMWORD PTR [rsp + 48], xmm4 ; store p1
+%endif
+
+ movdqa xmm3, xmm4 ; p1
+ psubusb xmm4, xmm5 ; p1-=p2
+ psubusb xmm5, xmm3 ; p2-=p1
+ por xmm4, xmm5 ; abs(p2 - p1)
+ psubusb xmm4, xmm7
+
+ por xmm1, xmm4
+ movdqa xmm2, xmm3 ; p1
+
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+%else
+ movq xmm4, [rsi + rcx*2] ; p0
+ movq xmm5, [rdi + rcx*2]
+ pslldq xmm5, 8
+ por xmm4, xmm5
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm3 ; p0-=p1
+ psubusb xmm3, xmm5 ; p1-=p0
+ por xmm4, xmm3 ; abs(p1 - p0)
+ movdqa t1, xmm4 ; save to t1
+
+ psubusb xmm4, xmm7
+ por xmm1, xmm4
+
+%if %1
+ movdqa xmm3, [rdi] ; q1
+%else
+ movdqa xmm3, q1 ; q1
+%endif
+
+ movdqa xmm4, xmm3 ; q1
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+ por xmm2, xmm3 ; abs(p1-q1)
+ pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ movdqa xmm6, xmm5 ; p0
+ movdqa xmm3, xmm0 ; q0
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ; get flimit
+ movdqa xmm2, XMMWORD PTR [rdx]
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
+ paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm1, xmm5
+ pxor xmm5, xmm5
+ pcmpeqb xmm1, xmm5 ; mask mm1
+%endmacro
+
+%macro LFH_HEV_MASK 0
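+ ; Build the high edge variance mask in xmm4: 0xFF where abs(q1-q0) > thresh
+ ; or abs(p1-p0) > thresh, using t0/t1 saved by LFH_FILTER_MASK (expects xmm5 = 0).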
+ mov rdx, arg(4) ; get thresh
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm4, t0 ; get abs (q1 - q0)
+ psubusb xmm4, xmm7
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+ psubusb xmm3, xmm7
+ paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb xmm4, xmm5
+
+ pcmpeqb xmm5, xmm5
+ pxor xmm4, xmm5
+%endmacro
+
+%macro BH_FILTER 1
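+ ; Apply the filter: vp8_filter = 3*(q0-p0) + hvm(p1-q1), masked by xmm1.
+ ; Leaves the p0 add in xmm2, the q0 subtract in xmm0 and the p1/q1 add in
+ ; xmm4; %1 selects row loads (Y) vs the stack slots (u/v) for p1/q1.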
+%if %1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%else
+ movdqa xmm2, p1 ; p1
+ movdqa xmm7, q1 ; q1
+%endif
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+ movdqa xmm2, xmm1
+ paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm5, xmm2
+ psraw xmm0, 11
+ psraw xmm5, 11
+ packsswb xmm0, xmm5
+ movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor xmm0, xmm0 ; 0
+ movdqa xmm5, xmm1 ; abcdefgh
+ punpcklbw xmm0, xmm1 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+ pxor xmm1, xmm1 ; 0
+ punpckhbw xmm1, xmm5 ; a0b0c0d0
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw xmm5, [ones GLOBAL]
+ paddsw xmm1, [ones GLOBAL]
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn xmm4, xmm5 ; high edge variance additive
+%endmacro
+
+%macro BH_WRITEBACK 1
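+ ; Store the filtered p1/p0/q0/q1. %1==1 writes whole 16-byte Y rows;
+ ; %1==0 splits each row back into its u (rsi) and v (rdi) 8-byte halves.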
+ paddsb xmm6, xmm2 ; p0+= p0 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rsi+rax], xmm6 ; write back
+%else
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi], xmm6
+%endif
+
+%if %1
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movdqa xmm6, p1 ; p1
+%endif
+ pxor xmm6, [t80 GLOBAL] ; reoffset
+ paddsb xmm6, xmm4 ; p1+= p1 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rsi+2*rax], xmm6 ; write back
+%else
+ movq MMWORD PTR [rsi + rax], xmm6 ; p1
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi + rax], xmm6
+%endif
+
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rsi], xmm3 ; write back
+%else
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi + rcx], xmm3
+%endif
+
+ psubsb xmm7, xmm4 ; q1-= q1 add
+ pxor xmm7, [t80 GLOBAL] ; unoffset
+%if %1
+ movdqa [rdi], xmm7 ; write back
+%else
+ movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
+ psrldq xmm7, 8
+ movq MMWORD PTR [rdi + rcx*2],xmm7
+%endif
+%endmacro
+
+
;void vp8_loop_filter_horizontal_edge_sse2
;(
; unsigned char *src_ptr,
@@ -33,179 +310,28 @@
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
+ sub rsp, 32 ; reserve 32 bytes
%define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
; calculate breakout conditions
- movdqu xmm2, [rdi+2*rax] ; q3
- movdqu xmm1, [rsi+2*rax] ; q2
- movdqa xmm6, xmm1 ; q2
- psubusb xmm1, xmm2 ; q2-=q3
- psubusb xmm2, xmm6 ; q3-=q2
- por xmm1, xmm2 ; abs(q3-q2)
- psubusb xmm1, xmm7 ;
-
-
- movdqu xmm4, [rsi+rax] ; q1
- movdqa xmm3, xmm4 ; q1
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
- por xmm4, xmm6 ; abs(q2-q1)
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- movdqu xmm4, [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
- psubusb xmm4, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
- por xmm4, xmm3 ; abs(q0-q1)
- movdqa t0, xmm4 ; save to t0
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- neg rax ; negate pitch to deal with above border
- movdqu xmm2, [rsi+4*rax] ; p3
- movdqu xmm4, [rdi+4*rax] ; p2
- movdqa xmm5, xmm4 ; p2
- psubusb xmm4, xmm2 ; p2-=p3
- psubusb xmm2, xmm5 ; p3-=p2
- por xmm4, xmm2 ; abs(p3 - p2)
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
-
- movdqu xmm4, [rsi+2*rax] ; p1
- movdqa xmm3, xmm4 ; p1
- psubusb xmm4, xmm5 ; p1-=p2
- psubusb xmm5, xmm3 ; p2-=p1
- por xmm4, xmm5 ; abs(p2 - p1)
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- movdqa xmm2, xmm3 ; p1
-
- movdqu xmm4, [rsi+rax] ; p0
- movdqa xmm5, xmm4 ; p0
- psubusb xmm4, xmm3 ; p0-=p1
- psubusb xmm3, xmm5 ; p1-=p0
- por xmm4, xmm3 ; abs(p1 - p0)
- movdqa t1, xmm4 ; save to t1
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- movdqu xmm3, [rdi] ; q1
- movdqa xmm4, xmm3 ; q1
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
- por xmm2, xmm3 ; abs(p1-q1)
- pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- movdqa xmm6, xmm5 ; p0
- movdqu xmm3, [rsi] ; q0
- psubusb xmm5, xmm3 ; p0-=q0
- psubusb xmm3, xmm6 ; q0-=p0
- por xmm5, xmm3 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ; get flimit
- movdqa xmm2, [rdx] ;
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm5
- pxor xmm5, xmm5
- pcmpeqb xmm1, xmm5 ; mask mm1
-
+ LFH_FILTER_MASK 1
; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, [rdx] ;
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
- paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm5
- pcmpeqb xmm5, xmm5
- pxor xmm4, xmm5
-
+ LFH_HEV_MASK
; start work on filters
- movdqu xmm2, [rsi+2*rax] ; p1
- movdqu xmm7, [rdi] ; q1
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
- movdqa xmm2, xmm1
- paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- pxor xmm0, xmm0 ;
- pxor xmm5, xmm5
- punpcklbw xmm0, xmm2 ;
- punpckhbw xmm5, xmm2 ;
- psraw xmm0, 11 ;
- psraw xmm5, 11
- packsswb xmm0, xmm5
- movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor xmm0, xmm0 ; 0
- movdqa xmm5, xmm1 ; abcdefgh
- punpcklbw xmm0, xmm1 ; e0f0g0h0
- psraw xmm0, 11 ; sign extended shift right by 3
- pxor xmm1, xmm1 ; 0
- punpckhbw xmm1, xmm5 ; a0b0c0d0
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [ones GLOBAL]
- paddsw xmm1, [ones GLOBAL]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- pandn xmm4, xmm5 ; high edge variance additive
-
- paddsb xmm6, xmm2 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
- movdqu [rsi+rax], xmm6 ; write back
-
- movdqu xmm6, [rsi+2*rax] ; p1
- pxor xmm6, [t80 GLOBAL] ; reoffset
- paddsb xmm6, xmm4 ; p1+= p1 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
- movdqu [rsi+2*rax], xmm6 ; write back
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
- movdqu [rsi], xmm3 ; write back
-
- psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [t80 GLOBAL] ; unoffset
- movdqu [rdi], xmm7 ; write back
+ BH_FILTER 1
+ ; write back the result
+ BH_WRITEBACK 1
add rsp, 32
pop rsp
@@ -219,7 +345,7 @@
ret
-;void vp8_loop_filter_vertical_edge_sse2
+;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
@@ -228,8 +354,8 @@
; const char *thresh,
; int count
;)
-global sym(vp8_loop_filter_vertical_edge_sse2)
-sym(vp8_loop_filter_vertical_edge_sse2):
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -240,414 +366,35 @@
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+ sub rsp, 96 ; reserve 96 bytes
+ %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
+ %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
+ %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
+ %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
+ %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
- lea rsi, [rsi + rax*4 - 4]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
- add rdi, rax
- lea rcx, [rdi + rax *8]
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
- ;transpose
- movq xmm7, QWORD PTR [rsi+2*rax] ; 67 66 65 64 63 62 61 60
- movq xmm6, QWORD PTR [rdi+2*rax] ; 77 76 75 74 73 72 71 70
-
- punpcklbw xmm7, xmm6 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
- movq xmm5, QWORD PTR [rsi] ; 47 46 45 44 43 42 41 40
-
- movq xmm4, QWORD PTR [rsi+rax] ; 57 56 55 54 53 52 51 50
- punpcklbw xmm5, xmm4 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-
- movdqa xmm3, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpckhwd xmm5, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-
- lea rsi, [rsi+ rax*8]
-
- punpcklwd xmm3, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
- movq xmm6, QWORD PTR [rsi + 2*rax] ; e7 e6 e5 e4 e3 e2 e1 e0
-
- movq xmm7, QWORD PTR [rcx + 2*rax] ; f7 f6 f5 f4 f3 f2 f1 f0
- punpcklbw xmm6, xmm7 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
- movq xmm4, QWORD PTR [rsi] ; c7 c6 c5 c4 c3 c2 c1 c0
- movq xmm7, QWORD PTR [rsi + rax] ; d7 d6 d5 d4 d3 d2 d1 d0
-
- punpcklbw xmm4, xmm7 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
- movdqa xmm7, xmm4 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
-
- punpckhwd xmm7, xmm6 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
- punpcklwd xmm4, xmm6 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
- ; xmm3 xmm4, xmm5 xmm7 in use
- neg rax
-
- lea rsi, [rsi+rax*8]
- movq xmm6, QWORD PTR [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq xmm1, QWORD PTR [rsi+rax ] ; 37 36 35 34 33 32 31 30
- punpcklbw xmm6, xmm1 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm2, QWORD PTR [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- movq xmm1, QWORD PTR [rdi+rax*4] ; 17 16 15 14 13 12 11 10
-
- punpcklbw xmm2, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- movdqa xmm0, xmm2
-
- punpckhwd xmm2, xmm6 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- punpcklwd xmm0, xmm6 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-
- movdqa xmm6, xmm2
- punpckldq xmm2, xmm5 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckhdq xmm6, xmm5 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
- ;xmm0 xmm2 xmm3 xmm4, xmm6, xmm7
-
- movdqa xmm5, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhdq xmm5, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- punpckldq xmm0, xmm3 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- lea rsi, [rcx+rax]
- ; xmm1, xmm3 free
- movq xmm1, QWORD PTR [rsi+rax*2] ; a7 a6 a5 a4 a3 a2 a1 a0
- movq xmm3, QWORD PTR [rsi+rax] ; b7 b6 b5 b4 b3 b2 b1 b0
-
- punpcklbw xmm1, xmm3 ;
- lea rdx, srct ;
-
- movdqa [rdx+16], xmm1 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
- movq xmm3, QWORD PTR [rsi+rax*4] ; 87 86 85 84 83 82 81 80
-
- movq xmm1, QWORD PTR [rcx+rax*4]
- punpcklbw xmm3, xmm1 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- movdqa [rdx], xmm3 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpckhwd xmm3, [rdx+16] ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
- movdqa xmm1, xmm3 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckhdq xmm1, xmm7 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
- punpckldq xmm3, xmm7 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
- movdqa xmm7, xmm2 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm7, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- punpckhqdq xmm2, xmm3 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa [rdx+32], xmm7 ; save 4s
-
- movdqa [rdx+48], xmm2 ; save 5s
- movdqa xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpckhqdq xmm7, xmm1 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 = q3
- punpcklqdq xmm6, xmm1 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 = q2
-
- ; free 1, 3 xmm7-7s xmm6-6s, xmm2-5s
- movq xmm1, QWORD PTR [rdx] ; 93 83 92 82 91 81 90 80
- movq xmm3, QWORD PTR [rdx+16] ; b3 a3 b2 a2 b1 a1 b0 a0
-
- punpcklwd xmm1, xmm3 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- movdqa xmm3, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
- punpckhdq xmm3, xmm4 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- punpckldq xmm1, xmm4 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
- movdqa xmm4, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpcklqdq xmm5, xmm3 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm4, xmm3 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa [rdx], xmm5 ; save 2s
-
- movdqa [rdx+16], xmm4 ; save 3s
-
- movdqa xmm3, xmm6 ;
- psubusb xmm3, xmm7 ; q3 - q2
-
- psubusb xmm7, xmm6 ; q2 - q3
- por xmm7, xmm3 ; abs(q3-q2)
-
- movdqa xmm3, xmm2 ; q1
- psubusb xmm3, xmm6 ; q1 - q2
-
- psubusb xmm6, xmm2 ; q2 - q1
- por xmm6, xmm3 ; abs(q2-q1)
-
-
- movdqa xmm3, xmm0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- punpcklqdq xmm0, xmm1 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- punpckhqdq xmm3, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- movdqa xmm1, xmm3
-
- psubusb xmm3, xmm0 ; p2-p3
- psubusb xmm0, xmm1 ; p3-p2
-
- por xmm0, xmm3 ; abs(p3-p2)
- movdqa xmm3, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- psubusb xmm3, xmm1 ; p1-p2
- psubusb xmm1, xmm5 ; p2-p1
-
- por xmm1, xmm3 ; abs(p1-p2)
- mov rdx, arg(3) ;limit
-
- movdqa xmm3, [rdx] ; limit
-
- psubusb xmm7, xmm3
- psubusb xmm0, xmm3
-
- psubusb xmm1, xmm3
- psubusb xmm6, xmm3
-
- por xmm7, xmm6
- por xmm0, xmm1
-
- por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movdqa xmm1, xmm5 ; p1
-
- movdqa xmm7, xmm4 ; xmm4 xmm7 = p0
-
- psubusb xmm7, xmm5 ; p0 - p1
- psubusb xmm5, xmm4 ; p1 - p0
-
- por xmm5, xmm7 ; abs(p1-p0)
- movdqa t0, xmm5 ; save abs(p1-p0)
-
- lea rdx, srct
- psubusb xmm5, xmm3
-
- por xmm0, xmm5 ; xmm0=mask
- movdqa xmm5, [rdx+32] ; xmm5=q0
-
- movdqa xmm7, [rdx+48] ; xmm7=q1
- movdqa xmm6, xmm5 ; mm6=q0
-
- movdqa xmm2, xmm7 ; q1
-
- psubusb xmm5, xmm7 ; q0-q1
- psubusb xmm7, xmm6 ; q1-q0
-
- por xmm7, xmm5 ; abs(q1-q0)
- movdqa t1, xmm7 ; save abs(q1-q0)
-
- psubusb xmm7, xmm3
- por xmm0, xmm7 ; mask
-
- movdqa xmm5, xmm2 ; q1
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
- por xmm5, xmm1 ; abs(p1-q1)
- pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ;
- movdqa xmm2, [rdx] ;flimit xmm2
-
- movdqa xmm1, xmm4 ; xmm1=xmm4=p0
-
- movdqa xmm7, xmm6 ; xmm7=xmm6=q0
- psubusb xmm1, xmm7 ; p0-q0
-
- psubusb xmm7, xmm4 ; q0-p0
- por xmm1, xmm7 ; abs(q0-p0)
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm3, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm1, xmm3 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
-
- por xmm1, xmm0; ; mask
-
- pxor xmm0, xmm0
- pcmpeqb xmm1, xmm0
+ ; calculate breakout conditions
+ LFH_FILTER_MASK 0
; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, [rdx]
-
- ;
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
-
- por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm0
-
- pcmpeqb xmm0, xmm0
- pxor xmm4, xmm0
+ LFH_HEV_MASK
; start work on filters
- lea rdx, srct
-
- movdqa xmm2, [rdx] ; p1
- movdqa xmm7, [rdx+48] ; q1
-
- movdqa xmm6, [rdx+16] ; p0
- movdqa xmm0, [rdx+32] ; q0
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
-
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
-
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1
- paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
- paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
- pxor xmm0, xmm0 ;
-
- pxor xmm5, xmm5
- punpcklbw xmm0, xmm2 ;
-
- punpckhbw xmm5, xmm2 ;
- psraw xmm0, 11 ;
-
- psraw xmm5, 11
- packsswb xmm0, xmm5
-
- movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor xmm0, xmm0 ; 0
- movdqa xmm5, xmm1 ; abcdefgh
-
- punpcklbw xmm0, xmm1 ; e0f0g0h0
- psraw xmm0, 11 ; sign extended shift right by 3
-
- pxor xmm1, xmm1 ; 0
- punpckhbw xmm1, xmm5 ; a0b0c0d0
-
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [ones GLOBAL]
-
- paddsw xmm1, [ones GLOBAL]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
- pandn xmm4, xmm5 ; high edge variance additive
-
- paddsb xmm6, xmm2 ; p0+= p0 add
- pxor xmm6, [t80 GLOBAL] ; unoffset
-
- ; mm6=p0 ;
- movdqa xmm1, [rdx] ; p1
- pxor xmm1, [t80 GLOBAL] ; reoffset
-
- paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm1, [t80 GLOBAL] ; unoffset
- ; mm6 = p0 mm1 = p1
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [t80 GLOBAL] ; unoffset
-
- ; mm3 = q0
- psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [t80 GLOBAL] ; unoffset
- ; mm7 = q1
-
- ; tranpose and write back
- ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
- movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
- punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklwd xmm1, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
- punpckhwd xmm5, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
- ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
- ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- ; xmm5 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
- ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
- lea rsi, [rsi+rax*8]
-
- movd [rsi+rax*4+2], xmm2
- psrldq xmm2, 4
-
- movd [rdi+rax*4+2], xmm2
- psrldq xmm2, 4
-
- movd [rsi+rax*2+2], xmm2
- psrldq xmm2, 4
-
- movd [rdi+rax*2+2], xmm2
- movd [rsi+2], xmm6
-
- psrldq xmm6, 4
- movd [rdi+2], xmm6
-
- psrldq xmm6, 4
- neg rax
-
- movd [rdi+rax+2], xmm6
- psrldq xmm6, 4
-
- movd [rdi+rax*2+2], xmm6
- lea rsi, [rsi+rax*8]
-
- neg rax
- ;;;;;;;;;;;;;;;;;;;;/
- movd [rsi+rax*4+2], xmm1
- psrldq xmm1, 4
-
- movd [rcx+rax*4+2], xmm1
- psrldq xmm1, 4
-
- movd [rsi+rax*2+2], xmm1
- psrldq xmm1, 4
-
- movd [rcx+rax*2+2], xmm1
- psrldq xmm1, 4
-
- movd [rsi+2], xmm5
- psrldq xmm5, 4
-
- movd [rcx+2], xmm5
- psrldq xmm5, 4
-
- neg rax
- movd [rcx+rax+2], xmm5
-
- psrldq xmm5, 4
- movd [rcx+rax*2+2], xmm5
+ BH_FILTER 0
+ ; write back the result
+ BH_WRITEBACK 0
add rsp, 96
pop rsp
@@ -661,233 +408,58 @@
ret
-;void vp8_mbloop_filter_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *flimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_sse2)
-sym(vp8_mbloop_filter_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
- ; calculate breakout conditions
- movdqa xmm2, XMMWORD PTR [rdi+2*rax] ; q3
- movdqa xmm1, XMMWORD PTR [rsi+2*rax] ; q2
-
- movdqa xmm6, xmm1 ; q2
- psubusb xmm1, xmm2 ; q2-=q3
-
-
- psubusb xmm2, xmm6 ; q3-=q2
- por xmm1, xmm2 ; abs(q3-q2)
-
- psubusb xmm1, xmm7
-
- ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
- movdqa xmm4, XMMWORD PTR [rsi+rax] ; q1
- movdqa xmm3, xmm4 ; q1
-
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
-
- por xmm4, xmm6 ; abs(q2-q1)
- psubusb xmm4, xmm7
-
- por xmm1, xmm4
- ; mm1 = mask, mm3=q1, mm7 = limit
-
- movdqa xmm4, XMMWORD PTR [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
-
- psubusb xmm4, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
-
- por xmm4, xmm3 ; abs(q0-q1)
- movdqa t0, xmm4 ; save to t0
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
- neg rax ; negate pitch to deal with above border
-
- movdqa xmm2, XMMWORD PTR [rsi+4*rax] ; p3
- movdqa xmm4, XMMWORD PTR [rdi+4*rax] ; p2
-
- movdqa xmm5, xmm4 ; p2
- psubusb xmm4, xmm2 ; p2-=p3
-
- psubusb xmm2, xmm5 ; p3-=p2
- por xmm4, xmm2 ; abs(p3 - p2)
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
- movdqa xmm4, XMMWORD PTR [rsi+2*rax] ; p1
- movdqa xmm3, xmm4 ; p1
-
- psubusb xmm4, xmm5 ; p1-=p2
- psubusb xmm5, xmm3 ; p2-=p1
-
- por xmm4, xmm5 ; abs(p2 - p1)
- psubusb xmm4, xmm7
-
- por xmm1, xmm4
-
- movdqa xmm2, xmm3 ; p1
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
- movdqa xmm4, XMMWORD PTR [rsi+rax] ; p0
- movdqa xmm5, xmm4 ; p0
-
- psubusb xmm4, xmm3 ; p0-=p1
- psubusb xmm3, xmm5 ; p1-=p0
-
- por xmm4, xmm3 ; abs(p1 - p0)
- movdqa t1, xmm4 ; save to t1
-
- psubusb xmm4, xmm7
- por xmm1, xmm4
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm5 = p0
- movdqa xmm3, XMMWORD PTR [rdi] ; q1
- movdqa xmm4, xmm3 ; q1
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
- por xmm2, xmm3 ; abs(p1-q1)
- pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- movdqa xmm6, xmm5 ; p0
- movdqa xmm3, xmm0 ; q0
-
- psubusb xmm5, xmm3 ; p0-=q0
- psubusb xmm3, xmm6 ; q0-=p0
-
- por xmm5, xmm3 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ; get flimit
- movdqa xmm2, XMMWORD PTR [rdx] ;
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm5
- pxor xmm5, xmm5
- pcmpeqb xmm1, xmm5 ; mask mm1
- ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm6 = p0,
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, XMMWORD PTR [rdx] ;
-
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7
-
- paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm5
-
- pcmpeqb xmm5, xmm5
- pxor xmm4, xmm5
- ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm6 = p0, mm4=hev
- ; start work on filters
- movdqa xmm2, XMMWORD PTR [rsi+2*rax] ; p1
- movdqa xmm7, XMMWORD PTR [rdi] ; q1
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+%macro MBH_FILTER 1
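+ ; Macroblock-edge filter: Filter2 = vp8_filter & hev adjusts p0/q0 here,
+ ; while vp8_filter & ~hev is left in xmm4 for the 27/18/9 taps applied in
+ ; MBH_WRITEBACK. %1 selects row loads (Y) vs stack slots (u/v) for p1/q1.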
+%if %1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%else
+ movdqa xmm2, p1 ; p1
+ movdqa xmm7, q1 ; q1
+%endif
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
-
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
-
psubsb xmm0, xmm6 ; q0 - p0
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
paddsb xmm2, xmm0 ; 2 * (q0 - p0)
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
pand xmm1, xmm2 ; mask filter values we don't care about
- ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
movdqa xmm2, xmm1 ; vp8_filter
pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
-
- movdqa xmm5, xmm2 ;
- paddsb xmm5, [t3 GLOBAL];
+ movdqa xmm5, xmm2
+ paddsb xmm5, [t3 GLOBAL]
pxor xmm0, xmm0 ; 0
pxor xmm7, xmm7 ; 0
-
punpcklbw xmm0, xmm5 ; e0f0g0h0
psraw xmm0, 11 ; sign extended shift right by 3
-
punpckhbw xmm7, xmm5 ; a0b0c0d0
psraw xmm7, 11 ; sign extended shift right by 3
-
packsswb xmm0, xmm7 ; Filter2 >>=3;
movdqa xmm5, xmm0 ; Filter2
-
paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor xmm0, xmm0 ; 0
+ pxor xmm0, xmm0 ; 0
pxor xmm7, xmm7 ; 0
punpcklbw xmm0, xmm2 ; e0f0g0h0
-
psraw xmm0, 11 ; sign extended shift right by 3
punpckhbw xmm7, xmm2 ; a0b0c0d0
-
psraw xmm7, 11 ; sign extended shift right by 3
packsswb xmm0, xmm7 ; Filter2 >>=3;
- ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
psubsb xmm3, xmm0 ; qs0 = qs0 - Filter1
paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
- ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
pandn xmm4, xmm1 ; vp8_filter&=~hev
+%endmacro
-
- ; mm3=qs0, mm4=filter2, mm6=ps0
-
+%macro MBH_WRITEBACK 1
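+ ; Apply u = vp8_signed_char_clamp((63 + Filter2 * {27,18,9}) >> 7) across the
+ ; edge and store p2..q2; %1==1 writes Y rows, %1==0 writes u/v 8-byte halves.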
; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
; s = vp8_signed_char_clamp(qs0 - u);
; *oq0 = s^0x80;
@@ -917,8 +489,20 @@
pxor xmm3, [t80 GLOBAL]
pxor xmm6, [t80 GLOBAL]
+%if %1
movdqa XMMWORD PTR [rsi+rax], xmm6
movdqa XMMWORD PTR [rsi], xmm3
+%else
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+
+ movq MMWORD PTR [rsi], xmm6 ; p0
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi], xmm6
+ movq MMWORD PTR [rsi + rcx], xmm3 ; q0
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi + rcx], xmm3
+%endif
; roughly 2/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
@@ -943,8 +527,13 @@
packsswb xmm1, xmm2
+%if %1
movdqa xmm3, XMMWORD PTR [rdi]
- movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1
+ movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1
+%else
+ movdqa xmm3, q1 ; q1
+ movdqa xmm6, p1 ; p1
+%endif
pxor xmm3, [t80 GLOBAL]
pxor xmm6, [t80 GLOBAL]
@@ -955,9 +544,18 @@
pxor xmm6, [t80 GLOBAL]
pxor xmm3, [t80 GLOBAL]
+%if %1
movdqa XMMWORD PTR [rdi], xmm3
movdqa XMMWORD PTR [rsi+rax*2],xmm6
+%else
+ movq MMWORD PTR [rsi + rcx*2],xmm3 ; q1
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi + rcx*2],xmm3
+ movq MMWORD PTR [rsi + rax], xmm6 ; p1
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi + rax], xmm6
+%endif
; roughly 1/7th difference across boundary
; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
; s = vp8_signed_char_clamp(qs2 - u);
@@ -981,11 +579,15 @@
packsswb xmm1, xmm2
-
+%if %1
movdqa xmm6, XMMWORD PTR [rdi+rax*4]
neg rax
- movdqa xmm3, XMMWORD PTR [rdi+rax ]
+ movdqa xmm3, XMMWORD PTR [rdi+rax]
+%else
+ movdqa xmm6, p2 ; p2
+ movdqa xmm3, q2 ; q2
+%endif
pxor xmm6, [t80 GLOBAL]
pxor xmm3, [t80 GLOBAL]
@@ -995,11 +597,68 @@
pxor xmm6, [t80 GLOBAL]
pxor xmm3, [t80 GLOBAL]
-
- movdqa XMMWORD PTR [rdi+rax ], xmm3
+%if %1
+ movdqa XMMWORD PTR [rdi+rax ],xmm3
neg rax
- movdqa XMMWORD PTR [rdi+rax*4], xmm6
+ movdqa XMMWORD PTR [rdi+rax*4],xmm6
+%else
+ movq MMWORD PTR [rsi+rax*2], xmm6 ; p2
+ psrldq xmm6, 8
+ movq MMWORD PTR [rdi+rax*2], xmm6
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+ movq MMWORD PTR [rsi+rcx*2 ],xmm3 ; q2
+ psrldq xmm3, 8
+ movq MMWORD PTR [rdi+rcx*2 ],xmm3
+%endif
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions
+ LFH_FILTER_MASK 1
+
+ ; calculate high edge variance
+ LFH_HEV_MASK
+
+ ; start work on filters
+ MBH_FILTER 1
+ ; write back the result
+ MBH_WRITEBACK 1
add rsp, 32
pop rsp
@@ -1013,6 +672,877 @@
ret
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
+ %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
+ %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
+ %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
+ %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions
+ LFH_FILTER_MASK 0
+
+ ; calculate high edge variance
+ LFH_HEV_MASK
+
+ ; start work on filters
+ MBH_FILTER 0
+ ; write back the result
+ MBH_WRITEBACK 0
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro TRANSPOSE_16X8_1 0
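+ ; First half of the 16x8 -> 8x16 transpose: reads 8-byte rows 0-7 and leaves
+ ; column pairs 6/7, 4/5 and 2/3 in xmm7, xmm4 and xmm3, with pair 0/1 in t0.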
+ movq xmm0, QWORD PTR [rdi+rcx*2] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+ movq xmm7, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm7, xmm0 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ movq xmm0, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ movq xmm5, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ punpcklbw xmm5, xmm0 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ movq xmm7, QWORD PTR [rsi + rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+
+ movq xmm0, QWORD PTR [rsi + rax*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm4, QWORD PTR [rsi + rax*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm7, QWORD PTR [rdi + rax*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+
+ punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa t0, xmm2 ; save to free XMM2
+%endmacro
+
+%macro TRANSPOSE_16X8_2 1
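+ ; Second half: transposes rows 8-15 and merges them with the first half.
+ ; %1==1 stores only p1..q1 at [rdx] .. [rdx+48] (64-byte srct, B edges);
+ ; %1==0 stores all eight lines p3..q3 at [rdx] .. [rdx+112] (MB edges).
+ ; Either way it exits with p3/p2/p0 in xmm2/xmm1/xmm3 and q2/q3 in xmm6/xmm7.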
+ movq xmm6, QWORD PTR [rdi+rcx*2] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+ movq xmm5, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+ movq xmm6, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ movq xmm1, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movq xmm5, QWORD PTR [rsi+rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+
+ movq xmm0, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm2, QWORD PTR [rsi+rax*4] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm5, QWORD PTR [rdi+rax*4] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+
+ punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+%if %1
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa [rdx], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+16], xmm3 ; save 3
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+32], xmm4 ; save 4
+ movdqa [rdx+48], xmm5 ; save 5
+
+ movdqa xmm1, t0 ; get lines 0 and 1 saved by TRANSPOSE_16X8_1
+ movdqa xmm2, xmm1
+
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+%else
+ movdqa [rdx+112], xmm7 ; save 7
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ movdqa [rdx+96], xmm6 ; save 6
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa [rdx+32], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+ movdqa [rdx+48], xmm3 ; save 3
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rdx+64], xmm4 ; save 4
+ movdqa [rdx+80], xmm5 ; save 5
+
+ movdqa xmm1, t0 ; get lines 0 and 1 saved by TRANSPOSE_16X8_1
+ movdqa xmm2, xmm1
+
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ movdqa [rdx+16], xmm1
+ movdqa [rdx], xmm2
+%endif
+%endmacro
+
+%macro LFV_FILTER_MASK 1
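+ ; Vertical-edge breakout test on the transposed lines: expects p3/p2/p0 in
+ ; xmm2/xmm1/xmm3 and q2/q3 in xmm6/xmm7, reads p1/q0/q1 from srct (%1 picks
+ ; the 4-line vs 8-line layout) and exits with the mask in xmm1 and xmm0 = 0.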
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ por xmm7, xmm0 ; abs (q3-q2)
+
+ movdqa xmm4, xmm5 ; q1
+ psubusb xmm4, xmm6 ; q1-q2
+
+ psubusb xmm6, xmm5 ; q2-q1
+ por xmm6, xmm4 ; abs (q2-q1)
+
+ movdqa xmm0, xmm1
+
+ psubusb xmm0, xmm2 ; p2 - p3;
+ psubusb xmm2, xmm1 ; p3 - p2;
+
+ por xmm0, xmm2 ; abs(p2-p3)
+%if %1
+ movdqa xmm2, [rdx] ; p1
+%else
+ movdqa xmm2, [rdx+32] ; p1
+%endif
+ movdqa xmm5, xmm2 ; p1
+
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+
+ mov rdx, arg(3) ; limit
+ movdqa xmm4, [rdx] ; limit
+
+ psubusb xmm7, xmm4
+
+ psubusb xmm0, xmm4 ; abs(p3-p2) > limit
+ psubusb xmm1, xmm4 ; abs(p2-p1) > limit
+
+ psubusb xmm6, xmm4 ; abs(q2-q1) > limit
+ por xmm7, xmm6 ; or
+
+ por xmm0, xmm1
+ por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movdqa xmm1, xmm2 ; p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ movdqa t0, xmm2 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb xmm2, xmm4 ; abs(p1-p0)>limit
+ por xmm0, xmm2 ; mask
+%if %1
+ movdqa xmm5, [rdx+32] ; q0
+ movdqa xmm7, [rdx+48] ; q1
+%else
+ movdqa xmm5, [rdx+64] ; q0
+ movdqa xmm7, [rdx+80] ; q1
+%endif
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm2, xmm7 ; q1
+ psubusb xmm5, xmm7 ; q0-q1
+
+ psubusb xmm7, xmm6 ; q1-q0
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ movdqa t1, xmm7 ; save abs(q1-q0)
+ psubusb xmm7, xmm4 ; abs(q1-q0)> limit
+
+ por xmm0, xmm7 ; mask
+
+ movdqa xmm5, xmm2 ; q1
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm2 ; p1-=q1
+ por xmm5, xmm1 ; abs(p1-q1)
+ pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ; flimit
+ movdqa xmm2, [rdx] ; flimit
+
+ movdqa xmm1, xmm3 ; p0
+ movdqa xmm7, xmm6 ; q0
+ psubusb xmm1, xmm7 ; p0-q0
+ psubusb xmm7, xmm3 ; q0-p0
+ por xmm1, xmm7 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ paddb xmm2, xmm2 ; flimit*2 (less than 255)
+ paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
+ por xmm1, xmm0 ; mask
+ pxor xmm0, xmm0
+ pcmpeqb xmm1, xmm0
+%endmacro
+
+%macro LFV_HEV_MASK 0
+ mov rdx, arg(4) ; get thresh
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm4, t0 ; get abs (q1 - q0)
+ psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh
+
+ movdqa xmm3, t1 ; get abs (p1 - p0)
+ psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
+
+ por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb xmm4, xmm0
+
+ pcmpeqb xmm0, xmm0
+ pxor xmm4, xmm0
+%endmacro
+
+%macro BV_FILTER 0
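+ ; Vertical-edge filter: p1/p0/q0/q1 come from srct; exits with the filtered
+ ; p1/p0/q0/q1 in xmm1/xmm6/xmm3/xmm7 ready for BV_TRANSPOSE.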
+ lea rdx, srct
+
+ movdqa xmm2, [rdx] ; p1
+ lea rdi, [rsi+rcx]
+ movdqa xmm7, [rdx+48] ; q1
+ movdqa xmm6, [rdx+16] ; p0
+ movdqa xmm0, [rdx+32] ; q0
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+ paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor xmm0, xmm0
+
+ pxor xmm5, xmm5
+ punpcklbw xmm0, xmm2
+
+ punpckhbw xmm5, xmm2
+ psraw xmm0, 11
+
+ psraw xmm5, 11
+ packsswb xmm0, xmm5
+
+ movdqa xmm2, xmm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor xmm0, xmm0 ; 0
+ movdqa xmm5, xmm1 ; abcdefgh
+
+ punpcklbw xmm0, xmm1 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ pxor xmm1, xmm1 ; 0
+ punpckhbw xmm1, xmm5 ; a0b0c0d0
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw xmm5, [ones GLOBAL]
+
+ paddsw xmm1, [ones GLOBAL]
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn xmm4, xmm5 ; high edge variance additive
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+ pxor xmm6, [t80 GLOBAL] ; unoffset
+
+ movdqa xmm1, [rdx] ; p1
+ pxor xmm1, [t80 GLOBAL] ; reoffset
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm1, [t80 GLOBAL] ; unoffset
+
+ psubsb xmm3, xmm0 ; q0-= q0 add
+ pxor xmm3, [t80 GLOBAL] ; unoffset
+
+ psubsb xmm7, xmm4 ; q1-= q1 add
+ pxor xmm7, [t80 GLOBAL] ; unoffset
+%endmacro
+
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
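+ ; Scatter the filtered columns (4 bytes per row at offset +2, i.e. p1..q1)
+ ; back into eight rows: %1 carries rows 0-3 of the group, %2 rows 4-7.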
+ movd [rsi+rax*4+2], %1
+ psrldq %1, 4
+
+ movd [rdi+rax*4+2], %1
+ psrldq %1, 4
+
+ movd [rsi+rax*2+2], %1
+ psrldq %1, 4
+
+ movd [rdi+rax*2+2], %1
+
+ movd [rsi+2], %2
+ psrldq %2, 4
+
+ movd [rdi+2], %2
+ psrldq %2, 4
+
+ movd [rdi+rcx+2], %2
+ psrldq %2, 4
+
+ movd [rdi+rcx*2+2], %2
+%endmacro
+
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2)
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ mov rcx, rax
+ neg rax
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8_1
+
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
+ lea rdx, srct
+ TRANSPOSE_16X8_2 1
+
+ ; calculate filter mask
+ LFV_FILTER_MASK 1
+ ; calculate high edge variance
+ LFV_HEV_MASK
+
+ ; start work on filters
+ BV_FILTER
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rsi+rcx]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2)
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ mov rcx, rax
+ neg rax
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8_1
+
+ mov rsi, arg(5) ; v_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx] ; rdi points to row +1 for indirect addressing
+
+ lea rdx, srct
+ TRANSPOSE_16X8_2 1
+
+ ; calculate filter mask
+ LFV_FILTER_MASK 1
+ ; calculate high edge variance
+ LFV_HEV_MASK
+
+ ; start work on filters
+ BV_FILTER
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro MBV_FILTER 0
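+ ; Macroblock vertical-edge filter: works on the eight transposed lines in
+ ; srct, storing the updated p1..q1 back to srct and exiting with the new
+ ; p2 in xmm6 and the new q2 in xmm3 for MBV_TRANSPOSE.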
+ lea rdx, srct
+
+ movdqa xmm2, [rdx+32] ; p1
+ movdqa xmm7, [rdx+80] ; q1
+ movdqa xmm6, [rdx+48] ; p0
+ movdqa xmm0, [rdx+64] ; q0
+
+ pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
+ pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
+ pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
+ pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1)
+
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+
+ movdqa xmm5, xmm2
+ paddsb xmm5, [t3 GLOBAL]
+
+ pxor xmm0, xmm0 ; 0
+ pxor xmm7, xmm7 ; 0
+
+ punpcklbw xmm0, xmm5 ; e0f0g0h0
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm7, xmm5 ; a0b0c0d0
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+ movdqa xmm5, xmm0 ; Filter2
+
+ paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor xmm0, xmm0 ; 0
+
+ pxor xmm7, xmm7 ; 0
+ punpcklbw xmm0, xmm2 ; e0f0g0h0
+
+ psraw xmm0, 11 ; sign extended shift right by 3
+ punpckhbw xmm7, xmm2 ; a0b0c0d0
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+ packsswb xmm0, xmm7 ; Filter2 >>=3;
+
+ psubsb xmm3, xmm0 ; qs0 = qs0 - Filter1
+ paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
+
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ pxor xmm2, xmm2
+ punpcklbw xmm1, xmm4
+
+ punpckhbw xmm2, xmm4
+ pmulhw xmm1, [s27 GLOBAL]
+
+ pmulhw xmm2, [s27 GLOBAL]
+ paddw xmm1, [s63 GLOBAL]
+
+ paddw xmm2, [s63 GLOBAL]
+ psraw xmm1, 7
+
+ psraw xmm2, 7
+ packsswb xmm1, xmm2
+
+ psubsb xmm3, xmm1
+ paddsb xmm6, xmm1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ movdqa [rdx+48], xmm6
+ movdqa [rdx+64], xmm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s18 GLOBAL]
+ pmulhw xmm2, [s18 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm3, [rdx + 80] ; q1
+ movdqa xmm6, [rdx + 32] ; p1
+
+ pxor xmm3, [t80 GLOBAL]
+ pxor xmm6, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ movdqa [rdx + 80], xmm3
+ movdqa [rdx + 32], xmm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+
+ punpcklbw xmm1, xmm4
+ punpckhbw xmm2, xmm4
+
+ pmulhw xmm1, [s9 GLOBAL]
+ pmulhw xmm2, [s9 GLOBAL]
+
+ paddw xmm1, [s63 GLOBAL]
+ paddw xmm2, [s63 GLOBAL]
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packsswb xmm1, xmm2
+
+ movdqa xmm6, [rdx+16]
+ movdqa xmm3, [rdx+96]
+
+ pxor xmm6, [t80 GLOBAL]
+ pxor xmm3, [t80 GLOBAL]
+
+ paddsb xmm6, xmm1
+ psubsb xmm3, xmm1
+
+ pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+%endmacro
+
+%macro MBV_TRANSPOSE 0
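+ ; Transpose the filtered 8x16 block back toward 16 rows: combines the lines
+ ; stored in srct with xmm6 (new p2) and xmm3 (new q2) left by MBV_FILTER.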
+ movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
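Stripped of the register choreography, MBV_TRANSPOSE is a byte transpose: it turns the eight 16-wide result rows held in srct (one row per pixel position, p3..q3, for sixteen pixel rows) back into row-major pixel order. A scalar model, purely for orientation:

    /* Illustrative only: what the punpck ladder rebuilds. */
    static void transpose_8x16(const unsigned char in[8][16],
                               unsigned char out[16][8])
    {
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 16; c++)
                out[c][r] = in[r][c];
    }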
+
+%macro MBV_WRITEBACK_1 0
+ movq QWORD PTR [rsi+rax*4], xmm0
+ psrldq xmm0, 8
+
+ movq QWORD PTR [rsi+rax*2], xmm6
+ psrldq xmm6, 8
+
+ movq QWORD PTR [rdi+rax*4], xmm0
+ movq QWORD PTR [rsi+rax], xmm6
+
+ movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+
+ punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq QWORD PTR [rsi], xmm0
+ psrldq xmm0, 8
+
+ movq QWORD PTR [rsi+rcx*2], xmm5
+ psrldq xmm5, 8
+
+ movq QWORD PTR [rsi+rcx], xmm0
+ movq QWORD PTR [rdi+rcx*2], xmm5
+
+ movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+ movdqa xmm0, xmm2
+
+ punpcklwd xmm0, xmm3 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+
+ punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+%macro MBV_WRITEBACK_2 0
+ movq QWORD PTR [rsi+rax*4], xmm1
+ psrldq xmm1, 8
+
+ movq QWORD PTR [rsi+rax*2], xmm3
+ psrldq xmm3, 8
+
+ movq QWORD PTR [rdi+rax*4], xmm1
+ movq QWORD PTR [rsi+rax], xmm3
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpckhdq xmm4, xmm2 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+ movq QWORD PTR [rsi], xmm1
+
+ psrldq xmm1, 8
+
+ movq QWORD PTR [rsi+rcx*2], xmm4
+ psrldq xmm4, 8
+
+ movq QWORD PTR [rsi+rcx], xmm1
+ movq QWORD PTR [rdi+rcx*2], xmm4
+%endmacro
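Only half of the sixteen transposed output rows fit in registers at once, so the writeback is split: MBV_WRITEBACK_1 stores pixel rows 0-7 and leaves xmm1/xmm3/xmm4/xmm2 holding rows 8-15 for MBV_WRITEBACK_2. Call sites therefore re-base the row pointers between the two halves, as the rewritten y-edge function below does:

    lea rsi, [rsi+rax*8]
    lea rdi, [rdi+rax*8]
    MBV_WRITEBACK_1
    lea rsi, [rsi+rcx*8]
    lea rdi, [rdi+rcx*8]
    MBV_WRITEBACK_2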
+
+
;void vp8_mbloop_filter_vertical_edge_sse2
;(
; unsigned char *src_ptr,
@@ -1039,531 +1569,116 @@
%define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
-
mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
lea rsi, [rsi + rax*4 - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
mov rcx, rax
- neg rcx
+ neg rax
; Transpose
- movq xmm0, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
- movq xmm7, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+ TRANSPOSE_16X8_1
- punpcklbw xmm7, xmm0 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
- movq xmm0, QWORD PTR [rsi+rax] ;
-
- movq xmm5, QWORD PTR [rsi] ;
- punpcklbw xmm5, xmm0 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-
- movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
- movq xmm7, QWORD PTR [rsi + rcx] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
-
- movq xmm0, QWORD PTR [rsi + rcx*2] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
- punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm4, QWORD PTR [rsi + rcx*4] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm7, QWORD PTR [rdi + rcx*4] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-
- punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
- punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
- movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-
- punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
- punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
- movdqa t0, xmm2 ; save to free XMM2
- ;movdqa t1, xmm3
-
- ; XMM3 XMM4 XMM7 in use
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-
- movq xmm6, QWORD PTR [rdi+rax*2] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
- movq xmm5, QWORD PTR [rsi+rax*2] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
- punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
- movq xmm6, QWORD PTR [rsi+rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
- movq xmm1, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
- punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
-
- movdqa xmm6, xmm1 ;
- punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
- punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
- movq xmm5, QWORD PTR [rsi+rcx] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-
- movq xmm0, QWORD PTR [rsi+rcx*2] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
- punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
- movq xmm2, QWORD PTR [rsi+rcx*4] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm5, QWORD PTR [rdi+rcx*4] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-
- punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
- movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- movdqa xmm0, xmm5
- punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
-
- punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
- punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
-
- movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
- punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
-
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
lea rdx, srct
- punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+ TRANSPOSE_16X8_2 0
- movdqa [rdx+112], xmm7 ; save 7
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- movdqa [rdx+96], xmm6 ; save 6
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa [rdx+32], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+48], xmm3 ; save 3
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+64], xmm4 ; save 4
- movdqa [rdx+80], xmm5 ; save 5
-
- movdqa xmm1, t0 ; get
- movdqa xmm2, xmm1 ;
-
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- movdqa [rdx+16], xmm1
- movdqa [rdx], xmm2
-
- movdqa xmm0, xmm6 ; q2
- psubusb xmm0, xmm7 ; q2-q3
-
- psubusb xmm7, xmm6 ; q3-q2
- por xmm7, xmm0 ; abs (q3-q2)
-
- movdqa xmm1, xmm5 ; q1
- psubusb xmm1, xmm6 ; q1-q2
-
- psubusb xmm6, xmm5 ; q2-q1
- por xmm6, xmm1 ; abs (q2-q1)
-
- ;/*
- ;movdqa xmm0, xmm4 ; q0
- ;psubusb xmm0 xmm5 ; q0-q1
- ;
- ;pusbusb xmm5, xmm4 ; q1-q0
- ;por xmm5, xmm0 ; abs (q1-q0)
- ;*/
-
- movdqa xmm1, [rdx+16] ; p2
- movdqa xmm0, xmm1
-
- psubusb xmm0, xmm2 ; p2 - p3;
- psubusb xmm2, xmm1 ; p3 - p2;
-
- por xmm0, xmm2 ; abs(p2-p3)
-
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm5, xmm2 ; p1
-
- psubusb xmm5, xmm1 ; p1-p2
- psubusb xmm1, xmm2 ; p2-p1
-
- por xmm1, xmm5 ; abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movdqa xmm4, [rdx] ; limit
- psubusb xmm7, xmm4 ;
-
-
- psubusb xmm0, xmm4 ; abs(p3-p2) > limit
- psubusb xmm1, xmm4 ; abs(p2-p1) > limit
-
- psubusb xmm6, xmm4 ; abs(q2-q1) > limit
- por xmm7, xmm6 ; or
-
- por xmm0, xmm1 ;
- por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movdqa xmm1, xmm2 ; p1
-
- movdqa xmm7, xmm3 ; p0
- psubusb xmm7, xmm2 ; p0-p1
-
- psubusb xmm2, xmm3 ; p1-p0
- por xmm2, xmm7 ; abs(p1-p0)
-
- movdqa t0, xmm2 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb xmm2, xmm4 ; abs(p1-p0)>limit
- por xmm0, xmm2 ; mask
-
- movdqa xmm5, [rdx+64] ; q0
- movdqa xmm7, [rdx+80] ; q1
-
- movdqa xmm6, xmm5 ; q0
- movdqa xmm2, xmm7 ; q1
- psubusb xmm5, xmm7 ; q0-q1
-
- psubusb xmm7, xmm6 ; q1-q0
- por xmm7, xmm5 ; abs(q1-q0)
-
- movdqa t1, xmm7 ; save abs(q1-q0)
- psubusb xmm7, xmm4 ; abs(q1-q0)> limit
-
- por xmm0, xmm7 ; mask
-
- movdqa xmm5, xmm2 ; q1
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
- por xmm5, xmm1 ; abs(p1-q1)
- pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero
- psrlw xmm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;flimit ;
- movdqa xmm2, [rdx] ; flimit
-
- movdqa xmm1, xmm3 ; p0
- movdqa xmm7, xmm6 ; q0
- psubusb xmm1, xmm7 ; p0-q0
- psubusb xmm7, xmm3 ; q0-p0
- por xmm1, xmm7 ; abs(q0-p0)
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- paddb xmm2, xmm2 ; flimit*2 (less than 255)
- paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
-
- psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
- por xmm1, xmm0; ; mask
- pxor xmm0, xmm0
- pcmpeqb xmm1, xmm0
-
+ ; calculate filter mask
+ LFV_FILTER_MASK 0
; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movdqa xmm7, [rdx]
-
- movdqa xmm4, t0 ; get abs (q1 - q0)
- psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
- psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
-
- por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb xmm4, xmm0
-
- pcmpeqb xmm0, xmm0
- pxor xmm4, xmm0
-
+ LFV_HEV_MASK
; start work on filters
- lea rdx, srct
-
- ; start work on filters
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm7, [rdx+80] ; q1
-
- pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values
- pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- movdqa xmm6, [rdx+48] ; p0
-
- movdqa xmm0, [rdx+64] ; q0
- pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values
-
- pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
-
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
- paddsb xmm2, xmm0 ; 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0)+ (p1 - q1)
-
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- ; xmm1 = vp8_filter, xmm4=hev, xmm6=ps0, xmm3=qs0
- movdqa xmm2, xmm1 ; vp8_filter
- pand xmm2, xmm4; ; Filter2 = vp8_filter & hev
-
- movdqa xmm5, xmm2
- paddsb xmm5, [t3 GLOBAL]
-
- pxor xmm0, xmm0 ; 0
- pxor xmm7, xmm7 ; 0
-
- punpcklbw xmm0, xmm5 ; e0f0g0h0
- psraw xmm0, 11 ; sign extended shift right by 3
-
- punpckhbw xmm7, xmm5 ; a0b0c0d0
- psraw xmm7, 11 ; sign extended shift right by 3
-
- packsswb xmm0, xmm7 ; Filter2 >>=3;
- movdqa xmm5, xmm0 ; Filter2
-
- paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor xmm0, xmm0 ; 0
-
- pxor xmm7, xmm7 ; 0
- punpcklbw xmm0, xmm2 ; e0f0g0h0
-
- psraw xmm0, 11 ; sign extended shift right by 3
- punpckhbw xmm7, xmm2 ; a0b0c0d0
-
- psraw xmm7, 11 ; sign extended shift right by 3
- packsswb xmm0, xmm7 ; Filter2 >>=3;
-
- ; xmm0= filter2 xmm1 = vp8_filter, xmm3 =qs0 xmm5=s xmm4 =hev xmm6=ps0
- psubsb xmm3, xmm0 ; qs0 =qs0 - filter1
- paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
-
-
- ; xmm1=vp8_filter, xmm3=qs0, xmm4 =hev xmm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
- pandn xmm4, xmm1 ; vp8_filter&=~hev
-
- ; xmm3=qs0, xmm4=filter2, xmm6=ps0
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
- pxor xmm0, xmm0
- pxor xmm1, xmm1
-
- pxor xmm2, xmm2
- punpcklbw xmm1, xmm4
-
- punpckhbw xmm2, xmm4
- pmulhw xmm1, [s27 GLOBAL]
-
- pmulhw xmm2, [s27 GLOBAL]
- paddw xmm1, [s63 GLOBAL]
-
- paddw xmm2, [s63 GLOBAL]
- psraw xmm1, 7
-
- psraw xmm2, 7
- packsswb xmm1, xmm2
-
- psubsb xmm3, xmm1
- paddsb xmm6, xmm1
-
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
-
- movdqa [rdx+48], xmm6
- movdqa [rdx+64], xmm3
-
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
-
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
-
- pmulhw xmm1, [s18 GLOBAL]
- pmulhw xmm2, [s18 GLOBAL]
-
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packsswb xmm1, xmm2
-
- movdqa xmm3, [rdx + 80] ;/q1
- movdqa xmm6, [rdx + 32] ; p1
-
- pxor xmm3, [t80 GLOBAL]
- pxor xmm6, [t80 GLOBAL]
-
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
-
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-
- movdqa [rdx + 80], xmm3
- movdqa [rdx + 32], xmm6
-
-
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor xmm1, xmm1
- pxor xmm2, xmm2
-
- punpcklbw xmm1, xmm4
- punpckhbw xmm2, xmm4
-
- pmulhw xmm1, [s9 GLOBAL]
- pmulhw xmm2, [s9 GLOBAL]
-
- paddw xmm1, [s63 GLOBAL]
- paddw xmm2, [s63 GLOBAL]
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packsswb xmm1, xmm2
-
- movdqa xmm6, [rdx+16]
- movdqa xmm3, [rdx+96]
-
- pxor xmm6, [t80 GLOBAL]
- pxor xmm3, [t80 GLOBAL]
-
- paddsb xmm6, xmm1
- psubsb xmm3, xmm1
-
- pxor xmm6, [t80 GLOBAL] ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- pxor xmm3, [t80 GLOBAL] ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 15 06
-
+ MBV_FILTER
; transpose and write back
- movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ MBV_TRANSPOSE
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_1
- movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ lea rsi, [rsi+rcx*8]
+ lea rdi, [rdi+rcx*8]
+ MBV_WRITEBACK_2
- punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+ add rsp, 160
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
- movdqa xmm5, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm5, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *flimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
- punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ ALIGN_STACK 16, rax
+ sub rsp, 160 ; reserve 160 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ mov rsi, arg(0) ;u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
- movdqa xmm6, xmm3 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
- punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+ lea rsi, [rsi + rax*4 - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ mov rcx, rax
+ neg rax
- movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+ ; Transpose
+ TRANSPOSE_16X8_1
- punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
- movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ ; XMM3 XMM4 XMM7 in use
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ lea rdx, srct
+ TRANSPOSE_16X8_2 0
- punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
- punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+ ; calculate filter mask
+ LFV_FILTER_MASK 0
+ ; calculate high edge variance
+ LFV_HEV_MASK
- lea rsi, [rsi+rcx*8]
- lea rdi, [rdi+rcx*8]
+ ; start work on filters
+ MBV_FILTER
- movq QWORD PTR [rsi+rcx*4], xmm0
- psrldq xmm0, 8
+ ; transpose and write back
+ MBV_TRANSPOSE
- movq QWORD PTR [rsi+rcx*2], xmm6
- psrldq xmm6, 8
-
- movq QWORD PTR [rdi+rcx*4], xmm0
- movq QWORD PTR [rsi+rcx], xmm6
-
- movdqa xmm0, xmm5 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
-
- punpckhdq xmm5, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-
- movq QWORD PTR [rsi], xmm0
- psrldq xmm0, 8
-
- movq QWORD PTR [rsi+rax*2], xmm5
- psrldq xmm5, 8
-
- movq QWORD PTR [rsi+rax], xmm0
- movq QWORD PTR [rdi+rax*2], xmm5
-
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- punpckhbw xmm3, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
- movdqa xmm0, xmm2
-
- punpcklwd xmm0, xmm3 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
- punpckhwd xmm2, xmm3 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
-
- movdqa xmm3, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
-
- punpckhdq xmm3, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
-
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-
- movq QWORD PTR [rsi+rcx*4], xmm1
- psrldq xmm1, 8
-
- movq QWORD PTR [rsi+rcx*2], xmm3
- psrldq xmm3, 8
-
- movq QWORD PTR [rdi+rcx*4], xmm1
- movq QWORD PTR [rsi+rcx], xmm3
-
- movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
-
- punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
- movq QWORD PTR [rsi], xmm1
-
- psrldq xmm1, 8
-
- movq QWORD PTR [rsi+rax*2], xmm4
- psrldq xmm4, 8
-
- movq QWORD PTR [rsi+rax], xmm1
- movq QWORD PTR [rdi+rax*2], xmm4
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ MBV_WRITEBACK_1
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi + rcx*4 - 4]
+ lea rdi, [rsi + rcx]
+ MBV_WRITEBACK_2
add rsp, 160
pop rsp
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 3a9437e..16498ab 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -34,6 +34,11 @@
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
#if HAVE_MMX
// Horizontal MB filtering
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -157,10 +162,7 @@
vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+ vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
}
@@ -183,10 +185,7 @@
vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
if (u_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+ vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
}
@@ -211,10 +210,7 @@
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
- vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
-
- if (v_ptr)
- vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+ vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
}
@@ -241,10 +237,7 @@
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
- vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
-
- if (v_ptr)
- vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+ vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
}
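The four new *_uv_sse2 entry points take the V-plane pointer as a sixth argument (the loop_filter_uvfunction shape declared above), so a single SSE2 pass now filters both chroma planes where two MMX calls were needed before. For vertical edges the asm stacks eight U rows and eight V rows into one 16-row tile via TRANSPOSE_16X8_1/2; a hedged scalar model of that packing (hypothetical helper, not the real data path):

    #include <string.h>

    /* Sketch: gather p3..q3 around the vertical edge for 8 rows of U
       and 8 rows of V into one 16x8 tile, so a single 16-wide filter
       pass covers both planes. */
    static void stack_uv_tile(unsigned char tile[16][8],
                              const unsigned char *u, const unsigned char *v,
                              int stride)
    {
        for (int r = 0; r < 8; r++) {
            memcpy(tile[r],     u + r * stride - 4, 8);
            memcpy(tile[8 + r], v + r * stride - 4, 8);
        }
    }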
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 3a237de..60ca74a 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -293,17 +293,16 @@
// Apply the loop filter if appropriate.
if (cm->filter_level > 0)
- {
vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
- cm->last_frame_type = cm->frame_type;
- cm->last_filter_type = cm->filter_type;
- cm->last_sharpness_level = cm->sharpness_level;
-
- }
vpx_usec_timer_mark(&lpftimer);
pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
}
+ if (cm->filter_level > 0) {
+ cm->last_frame_type = cm->frame_type;
+ cm->last_filter_type = cm->filter_type;
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 38d6042..18c8da0 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -281,11 +281,11 @@
YV12_BUFFER_CONFIG *post = &cm->new_frame;
loop_filter_info *lfi = cm->lf_info;
+ int frame_type = cm->frame_type;
int mb_row;
int mb_col;
-
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
@@ -319,7 +319,10 @@
}
// Initialize the loop filter for this frame.
- vp8_init_loop_filter(cm);
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter(lfi, frame_type);
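This pairs with the onyxd_if.c hunk above: the last_* fields snapshotted after each filtered frame are exactly what this check reads, so a full vp8_init_loop_filter() runs only when the filter type or sharpness actually changed, and the cheaper per-frame variant only when the key/inter frame type flipped. Factored as a helper, the decision looks like this (a sketch using only names visible in this diff; the real call sites inline it):

    static void maybe_reinit_loop_filter(VP8_COMMON *cm, loop_filter_info *lfi,
                                         int frame_type)
    {
        if ((cm->last_filter_type != cm->filter_type) ||
            (cm->last_sharpness_level != cm->sharpness_level))
            vp8_init_loop_filter(cm);                    /* full rebuild         */
        else if (frame_type != cm->last_frame_type)
            vp8_frame_init_loop_filter(lfi, frame_type); /* frame-type part only */
        /* else: reuse the cached tables untouched */
    }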
// Set up the buffer pointers
y_ptr = post->y_buffer;
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index c1fcfe2..b55bc51 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -100,14 +100,9 @@
void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
- void (*short_fdct4x4rd)(short *input, short *output, int pitch);
- void (*short_fdct8x4rd)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
-
void (*quantize_b)(BLOCK *b, BLOCKD *d);
-
-
} MACROBLOCK;
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 3075e58..58e3610 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -11,163 +11,54 @@
#include <math.h>
-
-static const short dct_matrix2[4][4] =
-{
- { 23170, 30274, 23170, 12540 },
- { 23170, 12540, -23170, -30274 },
- { 23170, -12540, -23170, 30274 },
- { 23170, -30274, 23170, -12540 }
-};
-
-static const short dct_matrix1[4][4] =
-{
- { 23170, 23170, 23170, 23170 },
- { 30274, 12540, -12540, -30274 },
- { 23170, -23170, -23170, 23170 },
- { 12540, -30274, 30274, -12540 }
-};
-
-
-#define _1STSTAGESHIFT 14
-#define _1STSTAGEROUNDING (1<<( _1STSTAGESHIFT-1))
-#define _2NDSTAGESHIFT 16
-#define _2NDSTAGEROUNDING (1<<( _2NDSTAGESHIFT-1))
-
-// using matrix multiply
void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
{
- int i, j, k;
- short temp[4][4];
- int sumtemp;
- pitch >>= 1;
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- sumtemp = 0;
-
- for (k = 0; k < 4; k++)
- {
- sumtemp += input[i*pitch+k] * dct_matrix2[k][j];
-
- }
-
- temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT);
- }
- }
-
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- sumtemp = 0;
-
- for (k = 0; k < 4; k++)
- {
- sumtemp += dct_matrix1[i][ k] * temp[k][ j];
- }
-
- output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT);
- }
- }
-
-}
-
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-{
- vp8_short_fdct4x4_c(input, output, pitch);
- vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-
-static const signed short x_c1 = 60547;
-static const signed short x_c2 = 46341;
-static const signed short x_c3 = 25080;
-
-void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
-{
int i;
int a1, b1, c1, d1;
- int a2, b2, c2, d2;
short *ip = input;
-
short *op = output;
- int temp1, temp2;
for (i = 0; i < 4; i++)
{
- a1 = (ip[0] + ip[3]) * 2;
- b1 = (ip[1] + ip[2]) * 2;
- c1 = (ip[1] - ip[2]) * 2;
- d1 = (ip[0] - ip[3]) * 2;
+ a1 = ((ip[0] + ip[3])<<3);
+ b1 = ((ip[1] + ip[2])<<3);
+ c1 = ((ip[1] - ip[2])<<3);
+ d1 = ((ip[0] - ip[3])<<3);
- temp1 = a1 + b1;
- temp2 = a1 - b1;
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
- op[0] = ((temp1 * x_c2) >> 16) + temp1;
- op[2] = ((temp2 * x_c2) >> 16) + temp2;
-
- temp1 = (c1 * x_c3) >> 16;
- temp2 = ((d1 * x_c1) >> 16) + d1;
-
- op[1] = temp1 + temp2;
-
- temp1 = (d1 * x_c3) >> 16;
- temp2 = ((c1 * x_c1) >> 16) + c1;
-
- op[3] = temp1 - temp2;
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12;
ip += pitch / 2;
op += 4;
- }
+ }
ip = output;
op = output;
-
for (i = 0; i < 4; i++)
{
-
a1 = ip[0] + ip[12];
b1 = ip[4] + ip[8];
c1 = ip[4] - ip[8];
d1 = ip[0] - ip[12];
+ op[0] = ( a1 + b1 + 7)>>4;
+ op[8] = ( a1 - b1 + 7)>>4;
- temp1 = a1 + b1;
- temp2 = a1 - b1;
-
- a2 = ((temp1 * x_c2) >> 16) + temp1;
- c2 = ((temp2 * x_c2) >> 16) + temp2;
-
- temp1 = (c1 * x_c3) >> 16;
- temp2 = ((d1 * x_c1) >> 16) + d1;
-
- b2 = temp1 + temp2;
-
- temp1 = (d1 * x_c3) >> 16;
- temp2 = ((c1 * x_c1) >> 16) + c1;
-
- d2 = temp1 - temp2;
-
-
- op[0] = (a2 + 1) >> 1;
- op[4] = (b2 + 1) >> 1;
- op[8] = (c2 + 1) >> 1;
- op[12] = (d2 + 1) >> 1;
+ op[4] =((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16;
ip++;
op++;
}
}
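The magic numbers are a Q12 rotation: 5352 and 2217 appear to be 4096*sqrt(2)*cos(pi/8) and 4096*sqrt(2)*sin(pi/8), with 14500/7500 and 12000/51000 as the per-stage rounding biases. A worked row-pass example with illustrative inputs (not from the source):

    /* ip[0..3] = {1, 2, 3, 4} */
    int a1 = (1 + 4) << 3;                               /*  40 */
    int b1 = (2 + 3) << 3;                               /*  40 */
    int c1 = (2 - 3) << 3;                               /*  -8 */
    int d1 = (1 - 4) << 3;                               /* -24 */
    int op0 = a1 + b1;                                   /*  80 */
    int op2 = a1 - b1;                                   /*   0 */
    int op1 = (c1 * 2217 + d1 * 5352 + 14500) >> 12;     /* -33 */
    int op3 = (d1 * 2217 - c1 * 5352 + 7500) >> 12;      /*  -1 */

The column pass then divides the <<3 prescale back out with (a1 + b1 + 7) >> 4 and the Q16 form of the same rotation; the + (d1 != 0) term looks like a bias keeping op[4] nonzero whenever d1 is, though the patch itself does not say so.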
-void vp8_fast_fdct8x4_c(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
{
- vp8_fast_fdct4x4_c(input, output, pitch);
- vp8_fast_fdct4x4_c(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index f79dba4..0ab40b3 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -32,16 +32,6 @@
#endif
extern prototype_fdct(vp8_fdct_short8x4);
-#ifndef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c
-#endif
-extern prototype_fdct(vp8_fdct_fast4x4);
-
-#ifndef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c
-#endif
-extern prototype_fdct(vp8_fdct_fast8x4);
-
#ifndef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c
#endif
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 0e16093..870cb58 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -66,7 +66,7 @@
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
- x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 824850c..8bc01df 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -130,7 +130,8 @@
for (i = 16; i < 24; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
}
@@ -140,14 +141,16 @@
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
@@ -157,14 +160,16 @@
for (i = 0; i < 16; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
void vp8_transform_mb(MACROBLOCK *x)
@@ -173,7 +178,8 @@
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
@@ -182,12 +188,14 @@
for (i = 16; i < 24; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
@@ -197,14 +205,16 @@
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
if (x->e_mbd.mbmi.mode != SPLITMV)
{
vp8_build_dcblock(x);
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
}
@@ -214,7 +224,8 @@
for (i = 0; i < 16; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
@@ -223,12 +234,14 @@
for (i = 16; i < 24; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
void vp8_stuff_inter16x16(MACROBLOCK *x)
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index a205667..dd98a09 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -257,9 +257,6 @@
z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
- z->short_fdct4x4rd = x->short_fdct4x4rd;
- z->short_fdct8x4rd = x->short_fdct8x4rd;
- z->short_fdct8x4rd = x->short_fdct8x4rd;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index e68d650..dd89f1a 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -68,8 +68,8 @@
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_c;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 4c3edd7..156578b 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -997,7 +997,7 @@
int tot_steps;
MV this_mv;
- unsigned int bestsad = UINT_MAX;
+ int bestsad = INT_MAX;
int best_site = 0;
int last_site = 0;
@@ -1238,7 +1238,7 @@
unsigned char *bestaddress;
MV *best_mv = &d->bmi.mv.as_mv;
MV this_mv;
- unsigned int bestsad = UINT_MAX;
+ int bestsad = INT_MAX;
int r, c;
unsigned char *check_here;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f3456a7..f331a4b 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -137,8 +137,6 @@
extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
@@ -1136,15 +1134,11 @@
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
- cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
- cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
}
else
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
- cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
- cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
}
cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
@@ -3237,7 +3231,7 @@
unsigned char block_size
)
{
- int byte = 0; // Buffer offset for the current pixel value being filtered
+ int byte = 0; // Buffer offset for current pixel being filtered
int frame = 0;
int modifier = 0;
int i, j, k;
@@ -3270,9 +3264,9 @@
for (frame = 0; frame < frame_count; frame++)
{
// get current frame pixel value
- int pixel_value = frames[frame][byte]; // int pixel_value = *frameptr;
+ int pixel_value = frames[frame][byte];
- modifier = src_byte; // modifier = s[byte];
+ modifier = src_byte;
modifier -= pixel_value;
modifier *= modifier;
modifier >>= strength;
@@ -3289,10 +3283,10 @@
}
accumulator += (count >> 1);
- accumulator *= fixed_divide[count]; // accumulator *= ppi->fixed_divide[count];
+ accumulator *= fixed_divide[count];
accumulator >>= 16;
- dst[byte] = accumulator; // d[byte] = accumulator;
+ dst[byte] = accumulator;
// move to next pixel
byte++;
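Read together, the cleaned-up lines implement a weighted running average in fixed point: fixed_divide[count] behaves as a Q16 reciprocal table, so the multiply-and-shift is a rounded divide by the total weight. A hedged sketch of the whole per-pixel blend; the clamp to a 16-step weight is assumed, since only the squared-difference test and the rounded divide are visible in this hunk:

    /* Sketch, not the shipped code. Assumes the source frame is among
       'samples' so count never reaches fixed_divide[0]. */
    static unsigned char temporal_blend(const unsigned char *samples, int n,
                                        unsigned char src, int strength,
                                        const unsigned int *fixed_divide)
    {
        unsigned int accumulator = 0, count = 0;
        for (int i = 0; i < n; i++) {
            int modifier = (int)src - samples[i];
            modifier = (modifier * modifier) >> strength; /* big diff, low weight */
            if (modifier > 16) modifier = 16;             /* assumed cap          */
            modifier = 16 - modifier;
            accumulator += (unsigned int)modifier * samples[i];
            count += modifier;
        }
        accumulator += count >> 1;              /* round to nearest       */
        accumulator *= fixed_divide[count];     /* Q16: (1 << 16) / count */
        return (unsigned char)(accumulator >> 16);
    }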
@@ -3398,7 +3392,8 @@
{
if ((frames_to_blur_backward + frames_to_blur_forward) >= max_frames)
{
- frames_to_blur_backward = max_frames - frames_to_blur_forward - 1;
+ frames_to_blur_backward
+ = max_frames - frames_to_blur_forward - 1;
}
}
else
@@ -3455,7 +3450,7 @@
for (frame = 0; frame < frames_to_blur; frame++)
{
- int which_buffer = cpi->last_alt_ref_sei - frame;
+ int which_buffer = start_frame - frame;
if (which_buffer < 0)
which_buffer += cpi->oxcf.lag_in_frames;
@@ -3479,7 +3474,7 @@
for (frame = 0; frame < frames_to_blur; frame++)
{
- int which_buffer = cpi->last_alt_ref_sei - frame;
+ int which_buffer = start_frame - frame;
if (which_buffer < 0)
which_buffer += cpi->oxcf.lag_in_frames;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 2d6dee1..70cf122 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1028,7 +1028,7 @@
vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict);
ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16);
- x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
// set to 0 no way to account for 2nd order DC so discount
//be->coeff[0] = 0;
@@ -1056,7 +1056,7 @@
// Fdct and building the 2nd order block
for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
{
- mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32);
+ mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
*Y2DCPtr++ = beptr->coeff[0];
*Y2DCPtr++ = beptr->coeff[16];
}
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
index 6aeac50..bf12fee 100644
--- a/vp8/encoder/x86/csystemdependent.c
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -181,10 +181,17 @@
// Willamette instruction set available:
vp8_mbuverror = vp8_mbuverror_xmm;
vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+#if 0 //new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_wmt;
+#else
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
@@ -218,10 +225,17 @@
// MMX instruction set available:
vp8_mbuverror = vp8_mbuverror_mmx;
vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+#if 0 // new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_mmx;
+#else
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
@@ -254,10 +268,10 @@
{
// Pure C:
vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
vp8_subtract_b = vp8_subtract_b_c;
vp8_subtract_mbuv = vp8_subtract_mbuv_c;
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index 32d6610..ff96c49 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -13,8 +13,7 @@
section .text
global sym(vp8_short_fdct4x4_mmx)
- global sym(vp8_fast_fdct4x4_mmx)
- global sym(vp8_fast_fdct8x4_wmt)
+ global sym(vp8_short_fdct8x4_wmt)
%define DCTCONSTANTSBITS (16)
@@ -24,10 +23,6 @@
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-; using matrix multiply with source and destbuffer has a pitch
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
push rbp
@@ -37,333 +32,6 @@
push rsi
push rdi
; end prolog
-
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- movsxd rax, dword ptr arg(2) ;pitch
- lea rdx, [dct_matrix GLOBAL]
-
- movq mm0, [rsi ]
- movq mm1, [rsi + rax]
-
- movq mm2, [rsi + rax*2]
- lea rsi, [rsi + rax*2]
-
- movq mm3, [rsi + rax]
-
- ; first column
- movq mm4, mm0
- movq mm7, [rdx]
-
- pmaddwd mm4, mm7
- movq mm5, mm1
-
- pmaddwd mm5, mm7
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
-
- pmaddwd mm5, mm7
- movq mm6, mm3
-
- pmaddwd mm6, mm7
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column (this is the last column, so we do not have save the source any more)
-
- pmaddwd mm0, [rdx+24]
-
- pmaddwd mm1, [rdx+24]
- movq mm6, mm0
-
- punpckldq mm0, mm1
- punpckhdq mm6, mm1
-
- paddd mm0, mm6
-
- pmaddwd mm2, [rdx+24]
-
- pmaddwd mm3, [rdx+24]
- movq mm7, mm2
-
- punpckldq mm2, mm3
- punpckhdq mm7, mm3
-
- paddd mm2, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm0, mm6
- paddd mm2, mm6
-
- psrad mm0, _1STSTAGESHIFT
- psrad mm2, _1STSTAGESHIFT
-
- packssdw mm0, mm2
-
- movq mm3, mm0
-
- ; done with one pass
- ; now start second pass
- movq mm0, [rdi ]
- movq mm1, [rdi+ 8]
- movq mm2, [rdi+ 16]
-
- movq mm4, mm0
-
- pmaddwd mm4, [rdx]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+24]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+24]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+24], mm4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
-sym(vp8_fast_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
@@ -379,11 +47,11 @@
movq mm3, [rcx + rax]
; get the constants
 ;shift left for precision
- paddw mm0, mm0
- paddw mm1, mm1
+ psllw mm0, 3
+ psllw mm1, 3
- psllw mm2, 1
- psllw mm3, 1
+ psllw mm2, 3
+ psllw mm3, 3
; transpose for the second stage
movq mm4, mm0 ; 00 01 02 03
@@ -531,20 +199,23 @@
movq mm3, mm5
; done with vertical
- pcmpeqw mm4, mm4
- pcmpeqw mm5, mm5
- psrlw mm4, 15
- psrlw mm5, 15
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ psllw mm4, 2
+ psllw mm5, 2
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm4
paddw mm3, mm5
- psraw mm0, 1
- psraw mm1, 1
- psraw mm2, 1
- psraw mm3, 1
+ psraw mm0, 3
+ psraw mm1, 3
+ psraw mm2, 3
+ psraw mm3, 3
movq [rdi ], mm0
movq [rdi+ 8], mm1
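The pcmpeqw/psrlw 15 pair builds the per-word constant 1 and the added psllw 2 turns it into 4, so the final-stage rounding moves from (x + 1) >> 1 to (x + 4) >> 3, balancing the <<3 prescale now applied on load. In C terms (names illustrative):

    int old_out = (sum + 1) >> 1;   /* previous rounding         */
    int new_out = (sum + 4) >> 3;   /* matches the psllw 3 loads */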
@@ -560,8 +231,8 @@
ret
-;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_fast_fdct8x4_wmt):
+;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_short_fdct8x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
@@ -584,11 +255,11 @@
movdqa xmm3, [rcx + rax]
; get the constants
 ;shift left for precision
- psllw xmm0, 1
- psllw xmm2, 1
+ psllw xmm0, 3
+ psllw xmm2, 3
- psllw xmm4, 1
- psllw xmm3, 1
+ psllw xmm4, 3
+ psllw xmm3, 3
; transpose for the second stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
@@ -758,20 +429,23 @@
; done with vertical
- pcmpeqw xmm4, xmm4
- pcmpeqw xmm5, xmm5;
- psrlw xmm4, 15
- psrlw xmm5, 15
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ psllw xmm4, 2
+ psllw xmm5, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm4
paddw xmm3, xmm5
- psraw xmm0, 1
- psraw xmm1, 1
- psraw xmm2, 1
- psraw xmm3, 1
+ psraw xmm0, 3
+ psraw xmm1, 3
+ psraw xmm2, 3
+ psraw xmm3, 3
movq QWORD PTR[rdi ], xmm0
movq QWORD PTR[rdi+ 8], xmm1
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 1cd137d..0e8cfcf 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,251 +11,179 @@
%include "vpx_ports/x86_abi_support.asm"
-global sym(vp8_short_fdct4x4_wmt)
-
-%define DCTCONSTANTSBITS (16)
-%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
-%define x_c1 (60547) ; cos(pi /8) * (1<<15)
-%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
-%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-
-;; using matrix multiply
-;void vp8_short_fdct4x4_wmt(short *input, short *output)
-sym(vp8_short_fdct4x4_wmt):
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
+ SHADOW_ARGS_TO_STACK 3
+;; SAVE_XMM
GET_GOT rbx
+ push rsi
+ push rdi
; end prolog
- mov rax, arg(0) ;input
- mov rcx, arg(1) ;output
+ mov rsi, arg(0)
+ movsxd rax, DWORD PTR arg(2)
+ lea rdi, [rsi + rax*2]
- lea rdx, [dct_matrix_sse2 GLOBAL]
+ movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
+ movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
+ movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
- movdqu xmm0, [rax ]
- movdqu xmm1, [rax+16]
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
- ; first column
- movdqa xmm2, xmm0
- movdqa xmm7, [rdx]
+ mov rdi, arg(1)
- pmaddwd xmm2, xmm7
- movdqa xmm3, xmm1
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
- pmaddwd xmm3, xmm7
- movdqa xmm4, xmm2
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
+ paddd xmm3, XMMWORD PTR[_14500 GLOBAL]
+ paddd xmm4, XMMWORD PTR[_7500 GLOBAL]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
- movdqa xmm3, xmm2
- punpckldq xmm2, xmm4
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
- punpckhdq xmm3, xmm4
- paddd xmm2, xmm3
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+ movdqa xmm5, XMMWORD PTR[_7 GLOBAL]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
- paddd xmm2, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
- psrad xmm2, _1STSTAGESHIFT
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[_mult_add GLOBAL] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[_mult_sub GLOBAL] ;a1 - b1
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[_cmp_mask GLOBAL] ;clear upper,
+ ;and keep bit 0 of lower
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[_5352_2217 GLOBAL] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[_2217_neg5352 GLOBAL] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[_12000 GLOBAL]
+ paddd xmm4, XMMWORD PTR[_51000 GLOBAL]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
+ movdqa XMMWORD PTR[rdi + 0], xmm0
+ movdqa XMMWORD PTR[rdi + 16], xmm1
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm3, _1STSTAGESHIFT
- packssdw xmm2, xmm3
-
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _1STSTAGESHIFT
-
- ;fourth column (this is the last column, so we do not have save the source any more)
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
-
-
- psrad xmm0, _1STSTAGESHIFT
- packssdw xmm3, xmm0
- ; done with one pass
- ; now start second pass
- movdqa xmm0, xmm2
- movdqa xmm1, xmm3
-
- pmaddwd xmm2, xmm7
- pmaddwd xmm3, xmm7
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
-
- punpckhdq xmm4, xmm3
- movdqa xmm3, xmm2
-
- punpckldq xmm2, xmm4
- punpckhdq xmm3, xmm4
-
- paddd xmm2, xmm3
- paddd xmm2, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm2, _2NDSTAGESHIFT
-
- ;second column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+16]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+16]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- packssdw xmm2, xmm3
-
- movdqu [rcx], xmm2
- ;third column
- movdqa xmm3, xmm0
- pmaddwd xmm3, [rdx+32]
-
- movdqa xmm4, xmm1
- pmaddwd xmm4, [rdx+32]
-
- movdqa xmm5, xmm3
- punpckldq xmm3, xmm4
-
- punpckhdq xmm5, xmm4
- movdqa xmm4, xmm3
-
- punpckldq xmm3, xmm5
- punpckhdq xmm4, xmm5
-
- paddd xmm3, xmm4
- paddd xmm3, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm3, _2NDSTAGESHIFT
- ;fourth column
- pmaddwd xmm0, [rdx+48]
- pmaddwd xmm1, [rdx+48]
-
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
-
- punpckhdq xmm4, xmm1
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm4
- punpckhdq xmm1, xmm4
-
- paddd xmm0, xmm1
- paddd xmm0, XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
-
- psrad xmm0, _2NDSTAGESHIFT
- packssdw xmm3, xmm0
-
- movdqu [rcx+16], xmm3
-
- mov rsp, rbp
; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
+;; RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-
SECTION_RODATA
-;static unsigned int dct1st_stage_rounding_sse2[4] =
align 16
-dct1st_stage_rounding_sse2:
- times 4 dd 8192
-
-
-;static unsigned int dct2nd_stage_rounding_sse2[4] =
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
align 16
-dct2nd_stage_rounding_sse2:
- times 4 dd 32768
-
-;static short dct_matrix_sse2[4][8]=
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
align 16
-dct_matrix_sse2:
- times 8 dw 23170
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
- dw 30274
- dw 12540
- dw -12540
- dw -30274
- dw 30274
- dw 12540
- dw -12540
- dw -30274
-
- dw 23170
- times 2 dw -23170
- times 2 dw 23170
- times 2 dw -23170
- dw 23170
-
- dw 12540
- dw -30274
- dw 30274
- dw -12540
- dw 12540
- dw -30274
- dw 30274
- dw -12540
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
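These tables map straight onto the C formulae once you see how pmaddwd pairs the words: each 32-bit lane holds a (d1, c1) or (a1, b1) word pair, so one multiply-accumulate evaluates a whole rotation term. Scalar equivalents per lane (a sketch; d1/c1/a1/b1 as in the register comments above):

    int rot_a = d1 * 5352 + c1 * 2217;   /* pmaddwd with _5352_2217    */
    int rot_b = d1 * 2217 - c1 * 5352;   /* pmaddwd with _2217_neg5352 */
    int sum   = a1 + b1;                 /* pmaddwd with _mult_add     */
    int diff  = a1 - b1;                 /* pmaddwd with _mult_sub     */

_14500/_7500 then bias the row pass before its >>12, _12000/_51000 bias the column pass before its >>16, _7 feeds the (a1 + b1 + 7) >> 4 rounding, and _cmp_mask isolates the (d1 != 0) correction added to op[4].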
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index 05d0180..bff52e1 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -22,46 +22,41 @@
#if HAVE_MMX
extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_mmx);
-extern prototype_fdct(vp8_fast_fdct4x4_mmx);
-extern prototype_fdct(vp8_fast_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
+#if 0
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-
-#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
-
-#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+#endif
#endif
#endif
#if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct4x4_wmt);
extern prototype_fdct(vp8_short_fdct8x4_wmt);
-extern prototype_fdct(vp8_fast_fdct8x4_wmt);
-
extern prototype_fdct(vp8_short_walsh4x4_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-#if 0
+extern prototype_fdct(vp8_short_fdct4x4_sse2);
+#if !CONFIG_RUNTIME_CPU_DETECT
+#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
#undef vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
#endif
+#undef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
+
#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
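On the consumer side: with CONFIG_RUNTIME_CPU_DETECT disabled, call sites bind at compile time to whatever the #defines above name; with it enabled, they go through the rtcd function table that x86_csystemdependent.c (below) fills in. A condensed sketch of that dispatch pattern with stand-in types; the real FDCT_INVOKE macro is assumed to live in vp8/encoder/dct.h:

    /* stand-ins for the rtcd table entry types */
    typedef void (*fdct_fn_t)(short *input, short *output, int pitch);
    typedef struct
    {
        fdct_fn_t short4x4, short8x4, fast4x4, fast8x4;
    } fdct_rtcd_t;

    #if CONFIG_RUNTIME_CPU_DETECT
    #define FDCT_INVOKE(ctx, fn) ((ctx)->fn)    /* indirect, table-driven */
    #else
    #define FDCT_INVOKE(ctx, fn) vp8_fdct_##fn  /* direct, via the #defines */
    #endif

    /* call-site shape:
     *   FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4)(src_diff, coeff, 32);
     */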
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index f375045..4d05156 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,15 +18,10 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_mmx(input, output, pitch);
- vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
-void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
-{
- vp8_fast_fdct4x4_mmx(input, output , pitch);
- vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
@@ -87,10 +82,10 @@
#endif
#if HAVE_SSE2
-void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_wmt(input, output, pitch);
- vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_sse2(input, output, pitch);
+ vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
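The wrapper above is just two 4x4 transforms over the halves of an 8-wide block: input + 4 steps four columns (shorts) to the right, output + 16 skips the 16 coefficients the first 4x4 emitted, and pitch, given in bytes, strides the rows identically in both calls. A toy illustration of the offsets:

    /* an 8x4 block of shorts with a row stride of 8 shorts = 16 bytes */
    short block[4 * 8];
    short coeff[32];

    vp8_short_fdct4x4_c(block,     coeff,      16); /* cols 0..3 -> coeff[0..15]  */
    vp8_short_fdct4x4_c(block + 4, coeff + 16, 16); /* cols 4..7 -> coeff[16..31] */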
int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
@@ -221,11 +216,19 @@
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
+#if 0 // new fdct
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
+#else
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
+
+#endif
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
@@ -270,13 +273,11 @@
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
-#if 0
- /* short SSE2 DCT currently disabled, does not match the MMX version */
- cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
- cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
-#endif
- /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
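These assignments run in the encoder's x86-specific init and are gated on CPUID feature bits, so SSE2 machines pick up the new transform while everything else keeps the C baseline set earlier in the function. A rough condensation of the selection flow; x86_simd_caps() and HAS_SSE2 are from vpx_ports/x86.h, VP8_COMP is the encoder context from vp8/encoder/onyx_int.h, and the wrapper name here is ours:

    #include "vpx_ports/x86.h"

    static void pick_fdct(VP8_COMP *cpi)
    {
        int flags = x86_simd_caps();

        /* C versions are the baseline... */
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;

        /* ...overridden only when the CPU advertises SSE2 */
        if (flags & HAS_SSE2)
        {
            cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
            cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
        }
    }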
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f09f258..c88df47 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -93,10 +93,10 @@
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm