Merge "Disable __longjmp_chk protection"
diff --git a/libmkv/EbmlWriter.c b/libmkv/EbmlWriter.c
index ac70d09..fbf2c66 100644
--- a/libmkv/EbmlWriter.c
+++ b/libmkv/EbmlWriter.c
@@ -11,6 +11,7 @@
 #include <stdlib.h>
 #include <wchar.h>
 #include <string.h>
+#include <limits.h>
 #if defined(_MSC_VER)
 #define LITERALU64(n) n
 #else
@@ -33,7 +34,7 @@
 
     val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7));
 
-    Ebml_Serialize(glob, (void *) &val, size);
+    Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
 }
 
 void Ebml_WriteString(EbmlGlobal *glob, const char *str)
@@ -60,21 +61,26 @@
 
 void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id)
 {
+    int len;
+
     if (class_id >= 0x01000000)
-        Ebml_Serialize(glob, (void *)&class_id, 4);
+        len = 4;
     else if (class_id >= 0x00010000)
-        Ebml_Serialize(glob, (void *)&class_id, 3);
+        len = 3;
     else if (class_id >= 0x00000100)
-        Ebml_Serialize(glob, (void *)&class_id, 2);
+        len = 2;
     else
-        Ebml_Serialize(glob, (void *)&class_id, 1);
+        len = 1;
+
+    Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
 }
+
 void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui)
 {
     unsigned char sizeSerialized = 8 | 0x80;
     Ebml_WriteID(glob, class_id);
-    Ebml_Serialize(glob, &sizeSerialized, 1);
-    Ebml_Serialize(glob, &ui, 8);
+    Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+    Ebml_Serialize(glob, &ui, sizeof(ui), 8);
 }
 
 void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui)
@@ -97,8 +103,8 @@
     }
 
     sizeSerialized = 0x80 | size;
-    Ebml_Serialize(glob, &sizeSerialized, 1);
-    Ebml_Serialize(glob, &ui, size);
+    Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+    Ebml_Serialize(glob, &ui, sizeof(ui), size);
 }
 //TODO: perhaps this is a poor name for this id serializer helper function
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin)
@@ -119,14 +125,14 @@
     unsigned char len = 0x88;
 
     Ebml_WriteID(glob, class_id);
-    Ebml_Serialize(glob, &len, 1);
-    Ebml_Serialize(glob,  &d, 8);
+    Ebml_Serialize(glob, &len, sizeof(len), 1);
+    Ebml_Serialize(glob,  &d, sizeof(d), 8);
 }
 
 void Ebml_WriteSigned16(EbmlGlobal *glob, short val)
 {
     signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
-    Ebml_Serialize(glob, &out, 3);
+    Ebml_Serialize(glob, &out, sizeof(out), 3);
 }
 
 void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s)
diff --git a/libmkv/EbmlWriter.h b/libmkv/EbmlWriter.h
index 8c7fe7c..324c9bc 100644
--- a/libmkv/EbmlWriter.h
+++ b/libmkv/EbmlWriter.h
@@ -15,7 +15,7 @@
 #include "vpx/vpx_integer.h"
 
 typedef struct EbmlGlobal EbmlGlobal;
-void  Ebml_Serialize(EbmlGlobal *glob, const void *, unsigned long);
+void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
 void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
 /////
 
diff --git a/libmkv/WebMElement.c b/libmkv/WebMElement.c
index 25a9024..0ef5100 100644
--- a/libmkv/WebMElement.c
+++ b/libmkv/WebMElement.c
@@ -35,11 +35,11 @@
     Ebml_WriteID(glob, SimpleBlock);
     unsigned long blockLength = 4 + dataLength;
     blockLength |= 0x10000000; //TODO check length < 0x0FFFFFFFF
-    Ebml_Serialize(glob, &blockLength, 4);
+    Ebml_Serialize(glob, &blockLength, sizeof(blockLength), 4);
     trackNumber |= 0x80;  //TODO check track nubmer < 128
     Ebml_Write(glob, &trackNumber, 1);
     //Ebml_WriteSigned16(glob, timeCode,2); //this is 3 bytes
-    Ebml_Serialize(glob, &timeCode, 2);
+    Ebml_Serialize(glob, &timeCode, sizeof(timeCode), 2);
     unsigned char flags = 0x00 | (isKeyframe ? 0x80 : 0x00) | (lacingFlag << 1) | discardable;
     Ebml_Write(glob, &flags, 1);
     Ebml_Write(glob, data, dataLength);
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index 8aab0ff..c0467cd 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -54,9 +54,11 @@
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+        rtcd->loopfilter.simple_mb_v =
+                vp8_loop_filter_simple_vertical_edge_armv6;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+        rtcd->loopfilter.simple_mb_h =
+                vp8_loop_filter_simple_horizontal_edge_armv6;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;
 
         rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm
index c7441b0..1cbbbcd 100644
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -53,14 +53,11 @@
 
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
+;r2     const char *blimit,
 ;r3     const char *limit,
 ;stack  const char *thresh,
 ;stack  int  count
 
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@@ -72,14 +69,18 @@
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |Hnext8|
     ; vp8_filter_mask() function
@@ -275,14 +276,18 @@
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |MBHnext8|
 
@@ -584,15 +589,19 @@
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
     ldr         lr, [src], pstep
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |Vnext8|
 
@@ -855,18 +864,22 @@
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     pld         [src, #23]
     ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     pld         [src, #23]
     ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
     pld         [src, #23]
     ldr         lr, [src], pstep
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |MBVnext8|
     ; vp8_filter_mask() function
@@ -906,6 +919,7 @@
     str         lr, [sp, #8]
     ldr         lr, [src], pstep
 
+
     TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
 
     ldr         lr, [sp, #8]                ; load back (f)limit accumulator
@@ -954,6 +968,7 @@
     beq         mbvskip_filter               ; skip filtering
 
 
+
     ;vp8_hevmask() function
     ;calculate high edge variance
 
@@ -1121,6 +1136,7 @@
     smlabb      r8, r6, lr, r7
     smlatb      r6, r6, lr, r7
     smlabb      r9, r10, lr, r7
+
     smlatb      r10, r10, lr, r7
     ssat        r8, #8, r8, asr #7
     ssat        r6, #8, r6, asr #7
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
index 40a71f4..5e00cf0 100644
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -45,35 +45,28 @@
     MEND
 
 
+
 src         RN  r0
 pstep       RN  r1
 
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
-;r3     const char *limit,
-;stack  const char *thresh,
-;stack  int  count
-
-; All 16 elements in flimit are equal. So, in the code, only one load is needed
-; for flimit. Same applies to limit. thresh is not used in simple looopfilter
+;r2     const char *blimit
 
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
-    ldr         r12, [r3]                   ; limit
+    ldrb        r12, [r2]                   ; blimit
     ldr         r3, [src, -pstep, lsl #1]   ; p1
     ldr         r4, [src, -pstep]           ; p0
     ldr         r5, [src]                   ; q0
     ldr         r6, [src, pstep]            ; q1
-    ldr         r7, [r2]                    ; flimit
+    orr         r12, r12, r12, lsl #8       ; blimit
     ldr         r2, c0x80808080
-    ldr         r9, [sp, #40]               ; count for 8-in-parallel
-    uadd8       r7, r7, r7                  ; flimit * 2
-    mov         r9, r9, lsl #1              ; double the count. we're doing 4 at a time
-    uadd8       r12, r7, r12                ; flimit * 2 + limit
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; double the count. we're doing 4 at a time
     mov         lr, #0                      ; need 0 in a couple places
 
 |simple_hnext8|
@@ -148,34 +141,32 @@
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
-    ldr         r12, [r2]                   ; r12: flimit
+    ldrb        r12, [r2]                   ; r12: blimit
     ldr         r2, c0x80808080
-    ldr         r7, [r3]                    ; limit
+    orr         r12, r12, r12, lsl #8
 
     ; load soure data to r7, r8, r9, r10
     ldrh        r3, [src, #-2]
     pld         [src, #23]                  ; preload for next block
     ldrh        r4, [src], pstep
-    uadd8       r12, r12, r12               ; flimit * 2
+    orr         r12, r12, r12, lsl #16
 
     ldrh        r5, [src, #-2]
     pld         [src, #23]
     ldrh        r6, [src], pstep
-    uadd8       r12, r12, r7                ; flimit * 2 + limit
 
     pkhbt       r7, r3, r4, lsl #16
 
     ldrh        r3, [src, #-2]
     pld         [src, #23]
     ldrh        r4, [src], pstep
-    ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel
 
     pkhbt       r8, r5, r6, lsl #16
 
     ldrh        r5, [src, #-2]
     pld         [src, #23]
     ldrh        r6, [src], pstep
-    mov         r11, r11, lsl #1            ; 4-in-parallel
+    mov         r11, #4                     ; double the count. we're doing 4 at a time
 
 |simple_vnext8|
     ; vp8_simple_filter_mask() function
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 6d1caa4..c841d45 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -9,30 +9,34 @@
  */
 
 
-#include "vpx_ports/config.h"
-#include <math.h>
+#include "vpx_config.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/onyxc_int.h"
 
+#if HAVE_ARMV6
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
+#endif
 
-extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);
 
-extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
 
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
 
 #if HAVE_ARMV6
 /*ARMV6 loopfilter functions*/
@@ -40,96 +44,72 @@
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
-}
-
-void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                                int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
-}
-
-void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
 }
 #endif
 
@@ -139,83 +119,58 @@
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
-}
-
-void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
-}
-
-void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
-}
-
-void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
 }
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
-}
-
-void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
 }
 #endif
diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h
index cd62207..390a547 100644
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -12,15 +12,17 @@
 #ifndef LOOPFILTER_ARM_H
 #define LOOPFILTER_ARM_H
 
+#include "vpx_config.h"
+
 #if HAVE_ARMV6
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
@@ -36,28 +38,29 @@
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
 
 #if HAVE_ARMV7
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
@@ -83,7 +86,8 @@
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
 
-#endif
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index e73dd64..e44be0a 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -14,109 +14,97 @@
     EXPORT  |vp8_loop_filter_vertical_edge_y_neon|
     EXPORT  |vp8_loop_filter_vertical_edge_uv_neon|
     ARM
-    REQUIRE8
-    PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
-; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                             const signed char *flimit,
-;                                             const signed char *limit,
-;                                             const signed char *thresh,
-;                                             int count)
 ; r0    unsigned char *src
 ; r1    int pitch
-; r2    const signed char *flimit
-; r3    const signed char *limit
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    sub         r2, r0, r1, lsl #2          ; move src pointer down by 4 lines
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1
+    add         r1, r1, r1
 
-    vld1.u8     {q3}, [r2], r1              ; p3
-    vld1.u8     {q4}, [r2], r1              ; p2
-    vld1.u8     {q5}, [r2], r1              ; p1
-    vld1.u8     {q6}, [r2], r1              ; p0
-    vld1.u8     {q7}, [r2], r1              ; q0
-    vld1.u8     {q8}, [r2], r1              ; q1
-    vld1.u8     {q9}, [r2], r1              ; q2
-    vld1.u8     {q10}, [r2]                 ; q3
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    sub         r0, r0, r1, lsl #1
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vld1.u8     {q3}, [r2@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r2@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r2@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r2@128]                  ; q2
+    vld1.u8     {q10}, [r12@128]                ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r12, r12, r1, lsl #1
 
     bl          vp8_loop_filter_neon
 
-    vst1.u8     {q5}, [r0], r1              ; store op1
-    vst1.u8     {q6}, [r0], r1              ; store op0
-    vst1.u8     {q7}, [r0], r1              ; store oq0
-    vst1.u8     {q8}, [r0], r1              ; store oq1
+    vst1.u8     {q5}, [r2@128], r1              ; store op1
+    vst1.u8     {q6}, [r12@128], r1             ; store op0
+    vst1.u8     {q7}, [r2@128], r1              ; store oq0
+    vst1.u8     {q8}, [r12@128], r1             ; store oq1
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 
-; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
-;                                              const signed char *flimit,
-;                                              const signed char *limit,
-;                                              const signed char *thresh,
-;                                              unsigned char *v)
+
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    vdup.u8     q1, r3                      ; duplicate limit
+    ldr         r12, [sp, #4]               ; load thresh
     ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q2, r12                     ; duplicate thresh
 
     sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    vld1.u8     {d6}, [r3], r1              ; p3
-    vld1.u8     {d8}, [r3], r1              ; p2
-    vld1.u8     {d10}, [r3], r1             ; p1
-    vld1.u8     {d12}, [r3], r1             ; p0
-    vld1.u8     {d14}, [r3], r1             ; q0
-    vld1.u8     {d16}, [r3], r1             ; q1
-    vld1.u8     {d18}, [r3], r1             ; q2
-    vld1.u8     {d20}, [r3]                 ; q3
-
-    ldr         r3, [sp, #4]                ; load thresh pointer
-
     sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-    vld1.u8     {d7}, [r12], r1             ; p3
-    vld1.u8     {d9}, [r12], r1             ; p2
-    vld1.u8     {d11}, [r12], r1            ; p1
-    vld1.u8     {d13}, [r12], r1            ; p0
-    vld1.u8     {d15}, [r12], r1            ; q0
-    vld1.u8     {d17}, [r12], r1            ; q1
-    vld1.u8     {d19}, [r12], r1            ; q2
-    vld1.u8     {d21}, [r12]                ; q3
 
-    vld1.s8     {d4[], d5[]}, [r3]          ; thresh
+    vld1.u8     {d6}, [r3@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1             ; p3
+    vld1.u8     {d8}, [r3@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1             ; p2
+    vld1.u8     {d10}, [r3@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1            ; p1
+    vld1.u8     {d12}, [r3@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1            ; p0
+    vld1.u8     {d14}, [r3@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1            ; q0
+    vld1.u8     {d16}, [r3@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1            ; q1
+    vld1.u8     {d18}, [r3@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1            ; q2
+    vld1.u8     {d20}, [r3@64]                 ; q3
+    vld1.u8     {d21}, [r12@64]                ; q3
 
     bl          vp8_loop_filter_neon
 
     sub         r0, r0, r1, lsl #1
     sub         r2, r2, r1, lsl #1
 
-    vst1.u8     {d10}, [r0], r1             ; store u op1
-    vst1.u8     {d11}, [r2], r1             ; store v op1
-    vst1.u8     {d12}, [r0], r1             ; store u op0
-    vst1.u8     {d13}, [r2], r1             ; store v op0
-    vst1.u8     {d14}, [r0], r1             ; store u oq0
-    vst1.u8     {d15}, [r2], r1             ; store v oq0
-    vst1.u8     {d16}, [r0]                 ; store u oq1
-    vst1.u8     {d17}, [r2]                 ; store v oq1
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r2@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r2@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64]                 ; store u oq1
+    vst1.u8     {d17}, [r2@64]                 ; store v oq1
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
 
 ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
@@ -124,39 +112,38 @@
 ;                                           const signed char *limit,
 ;                                           const signed char *thresh,
 ;                                           int count)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+
 |vp8_loop_filter_vertical_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    sub         r2, r0, #4                  ; src ptr down by 4 columns
-    sub         r0, r0, #2                  ; dst ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, #4                 ; src ptr down by 4 columns
+    add         r1, r1, r1
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1, asr #1
 
-    vld1.u8     {d6}, [r2], r1              ; load first 8-line src data
-    vld1.u8     {d8}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d8}, [r12], r1
     vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r2], r1
+    vld1.u8     {d12}, [r12], r1
     vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d16}, [r12], r1
     vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r2], r1
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {d20}, [r12], r1
 
     vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r2], r1
+    vld1.u8     {d9}, [r12], r1
     vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r2], r1
+    vld1.u8     {d13}, [r12], r1
     vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d19}, [r2], r1
-    vld1.u8     {d21}, [r2]
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d19}, [r2]
+    vld1.u8     {d21}, [r12]
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
@@ -164,6 +151,8 @@
     vtrn.32     q5, q9
     vtrn.32     q6, q10
 
+    vdup.u8     q2, r3                     ; duplicate thresh
+
     vtrn.16     q3, q5
     vtrn.16     q4, q6
     vtrn.16     q7, q9
@@ -178,28 +167,34 @@
 
     vswp        d12, d11
     vswp        d16, d13
+
+    sub         r0, r0, #2                 ; dst ptr
+
     vswp        d14, d12
     vswp        d16, d15
 
+    add         r12, r0, r1, asr #1
+
     ;store op1, op0, oq0, oq1
     vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
     vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
     vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
     vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r0]
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
 
-    ldmia       sp!, {pc}
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
 
 ; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
@@ -209,38 +204,36 @@
 ;                                            unsigned char *v)
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
 |vp8_loop_filter_vertical_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r12, r0, #4                  ; move u pointer down by 4 columns
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    sub         r12, r0, #4                 ; move u pointer down by 4 columns
     ldr         r2, [sp, #8]                ; load v ptr
-
-    vld1.u8     {d6}, [r12], r1              ;load u data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d20}, [r12]
-
+    vdup.u8     q1, r3                      ; duplicate limit
     sub         r3, r2, #4                  ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r12], r1             ;load u data
     vld1.u8     {d7}, [r3], r1              ;load v data
+    vld1.u8     {d8}, [r12], r1
     vld1.u8     {d9}, [r3], r1
+    vld1.u8     {d10}, [r12], r1
     vld1.u8     {d11}, [r3], r1
+    vld1.u8     {d12}, [r12], r1
     vld1.u8     {d13}, [r3], r1
+    vld1.u8     {d14}, [r12], r1
     vld1.u8     {d15}, [r3], r1
+    vld1.u8     {d16}, [r12], r1
     vld1.u8     {d17}, [r3], r1
+    vld1.u8     {d18}, [r12], r1
     vld1.u8     {d19}, [r3], r1
+    vld1.u8     {d20}, [r12]
     vld1.u8     {d21}, [r3]
 
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    ldr        r12, [sp, #4]               ; load thresh
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
@@ -248,6 +241,8 @@
     vtrn.32     q5, q9
     vtrn.32     q6, q10
 
+    vdup.u8     q2, r12                     ; duplicate thresh
+
     vtrn.16     q3, q5
     vtrn.16     q4, q6
     vtrn.16     q7, q9
@@ -258,18 +253,16 @@
     vtrn.8      q7, q8
     vtrn.8      q9, q10
 
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-
     bl          vp8_loop_filter_neon
 
-    sub         r0, r0, #2
-    sub         r2, r2, #2
-
     vswp        d12, d11
     vswp        d16, d13
     vswp        d14, d12
     vswp        d16, d15
 
+    sub         r0, r0, #2
+    sub         r2, r2, #2
+
     ;store op1, op0, oq0, oq1
     vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
     vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
@@ -288,7 +281,7 @@
     vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
 
 ; void vp8_loop_filter_neon();
@@ -316,42 +309,44 @@
     vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
     vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
     vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
 
     vmax.u8     q11, q11, q12
     vmax.u8     q12, q13, q14
     vmax.u8     q3, q3, q4
     vmax.u8     q15, q11, q12
 
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
     ; vp8_hevmask
     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
     vmax.u8     q15, q15, q3
 
-    vadd.u8     q0, q0, q0                  ; flimit * 2
-    vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
-    vcge.u8     q15, q1, q15
+    vmov.u8     q10, #0x80                   ; 0x80
 
     vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
     vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
 
-    vmov.u8     q0, #0x80                   ; 0x80
+    vcge.u8     q15, q1, q15
 
     ; vp8_filter() function
     ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-    veor        q8, q8, q0                  ; qs1
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
 
     vmov.u8     q10, #3                     ; #3
 
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q11, d15, d13
 
+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
     vmovl.u8    q4, d20
 
     vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
@@ -378,19 +373,20 @@
     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
     vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
 
+
     vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
     vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
 
     ; outer tap adjustments: ++vp8_filter >> 1
     vrshr.s8    q1, q1, #1
     vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-
+    vmov.u8     q0, #0x80                   ; 0x80
     vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
     vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)
 
-    veor        q5, q13, q0                 ; *op1 = u^0x80
     veor        q6, q11, q0                 ; *op0 = u^0x80
     veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
     veor        q8, q12, q0                 ; *oq1 = u^0x80
 
     bx          lr
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
index 7c5ea36..adf848b 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -9,99 +9,109 @@
 ;
 
 
-    EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    EXPORT  |vp8_loop_filter_bhs_neon|
+    EXPORT  |vp8_loop_filter_mbhs_neon|
     ARM
-    REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0    unsigned char *s,
-; r1    int p, //pitch
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; stack(r4) const signed char *thresh (unused)
-; //stack(r5)   int count --unused
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines
 
-    vld1.u8     {q5}, [r0], r1              ; p1
-    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
-    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
-    vld1.u8     {q6}, [r0], r1              ; p0
-    vmov.u8     q0, #0x80                   ; 0x80
-    vld1.u8     {q7}, [r0], r1              ; q0
-    vmov.u8     q10, #0x03                  ; 0x03
-    vld1.u8     {q8}, [r0]                  ; q1
+    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
 
-    ;vp8_filter_mask() function
+    vld1.u8     {q7}, [r0@128], r1          ; q0
+    vld1.u8     {q5}, [r3@128], r1          ; p0
+    vld1.u8     {q8}, [r0@128]              ; q1
+    vld1.u8     {q6}, [r3@128]              ; p1
+
     vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
     vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+
     vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
     vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q13, #3
     vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
 
-    ;vp8_filter() function
     veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
     veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
     veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
     veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
 
-    vadd.u8     q1, q1, q1                  ; flimit * 2
-    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
 
-;;;;;;;;;;
-    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q3, d15, d13
 
     vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
 
-    ;vmul.i8    q2, q2, q10                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q11, q2, q2                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q12, q3, q3
+    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q3, q3, q13
 
+    vmov.u8     q10, #0x03                  ; 0x03
     vmov.u8     q9, #0x04                   ; 0x04
 
-    vadd.s16    q2, q2, q11
-    vadd.s16    q3, q3, q12
-
     vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
     vaddw.s8    q3, q3, d9
 
-    ;vqadd.s8   q4, q4, q2                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d9, q3
-;;;;;;;;;;;;;
 
-    vand        q4, q4, q15                 ; vp8_filter &= mask
+    vand        q14, q4, q15                ; vp8_filter &= mask
 
-    vqadd.s8    q2, q4, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q4, q4, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vqadd.s8    q2, q14, q10                ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q9                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q4, q4, #3                  ; Filter1 >>= 3
+    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
 
-    sub         r0, r0, r1, lsl #1
+    sub         r0, r0, r1
 
     ;calculate output
     vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
     vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
 
-    add         r3, r0, r1
-
     veor        q6, q11, q0                 ; *op0 = u^0x80
     veor        q7, q10, q0                 ; *oq0 = u^0x80
 
-    vst1.u8     {q6}, [r0]                  ; store op0
-    vst1.u8     {q7}, [r3]                  ; store oq0
+    vst1.u8     {q6}, [r3@128]              ; store op0
+    vst1.u8     {q7}, [r0@128]              ; store oq0
 
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
 
-;-----------------
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    vdup.s8     q1, r3                      ; duplicate blim
+
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8* y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+    ldrb        r3, [r2]                   ; load blim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
 
     END
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index a7f7b69..e690df2 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,59 +9,54 @@
 ;
 
 
-    EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
+    ;EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
+    EXPORT |vp8_loop_filter_bvs_neon|
+    EXPORT |vp8_loop_filter_mbvs_neon|
     ARM
-    REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0    unsigned char *s,
-; r1    int p, //pitch
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; stack(r4) const signed char *thresh (unused)
-; //stack(r5)   int count --unused
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
     sub         r0, r0, #2                  ; move src pointer down by 2 columns
+    add         r12, r1, r1
+    add         r3, r0, r1
 
-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
-    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
-    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
+    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
 
-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vmov.u8     q0, #0x80                ; 0x80
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vmov.u8     q11, #0x03              ; 0x03
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vmov.u8     q12, #0x04               ; 0x04
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
 
     vswp        d7, d10
     vswp        d12, d9
-    ;vswp       q4, q5                      ; p1:q3, p0:q5, q0:q4, q1:q6
 
     ;vp8_filter_mask() function
     ;vp8_hevmask() function
     sub         r0, r0, r1, lsl #4
     vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
     vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
+
     vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
     vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q11, #3
     vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
 
     veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
@@ -69,80 +64,91 @@
     veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
     veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
 
-    vadd.u8     q1, q1, q1                  ; flimit * 2
-    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
     vcge.u8     q15, q1, q15                ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
 
-    ;vp8_filter() function
-;;;;;;;;;;
-    ;vqsub.s8   q2, q5, q4                  ; ( qs0 - ps0)
     vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
     vsubl.s8    q13, d9, d11
 
-    vqsub.s8    q1, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+    vqsub.s8    q14, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
 
-    ;vmul.i8    q2, q2, q11                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vadd.s16    q10, q2, q2                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q14, q13, q13
-    vadd.s16    q2, q2, q10
-    vadd.s16    q13, q13, q14
+    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q13, q13, q11
 
-    ;vqadd.s8   q1, q1, q2
-    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
+    vmov.u8     q11, #0x03                  ; 0x03
+    vmov.u8     q12, #0x04                  ; 0x04
 
-    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d3, q13
+    vaddw.s8    q2, q2, d28                  ; vp8_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d29
+
+    vqmovn.s16  d28, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d29, q13
 
     add         r0, r0, #1
-    add         r2, r0, r1
-;;;;;;;;;;;
+    add         r3, r0, r1
 
-    vand        q1, q1, q15                 ; vp8_filter &= mask
+    vand        q14, q14, q15                 ; vp8_filter &= mask
 
-    vqadd.s8    q2, q1, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q1, q1, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vqadd.s8    q2, q14, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3
 
     ;calculate output
-    vqsub.s8    q10, q4, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
     vqadd.s8    q11, q5, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q4, q14                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
 
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
     veor        q6, q11, q0                 ; *op0 = u^0x80
-
-    add         r3, r2, r1
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    add         r12, r1, r1
     vswp        d13, d14
-    add         r12, r3, r1
 
     ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0]
-    vst2.8      {d12[1], d13[1]}, [r2]
-    vst2.8      {d12[2], d13[2]}, [r3]
-    vst2.8      {d12[3], d13[3]}, [r12], r1
-    add         r0, r12, r1
-    vst2.8      {d12[4], d13[4]}, [r12]
-    vst2.8      {d12[5], d13[5]}, [r0], r1
-    add         r2, r0, r1
-    vst2.8      {d12[6], d13[6]}, [r0]
-    vst2.8      {d12[7], d13[7]}, [r2], r1
-    add         r3, r2, r1
-    vst2.8      {d14[0], d15[0]}, [r2]
-    vst2.8      {d14[1], d15[1]}, [r3], r1
-    add         r12, r3, r1
-    vst2.8      {d14[2], d15[2]}, [r3]
-    vst2.8      {d14[3], d15[3]}, [r12], r1
-    add         r0, r12, r1
-    vst2.8      {d14[4], d15[4]}, [r12]
-    vst2.8      {d14[5], d15[5]}, [r0], r1
-    add         r2, r0, r1
-    vst2.8      {d14[6], d15[6]}, [r0]
-    vst2.8      {d14[7], d15[7]}, [r2]
+    vst2.8      {d12[0], d13[0]}, [r0], r12
+    vst2.8      {d12[1], d13[1]}, [r3], r12
+    vst2.8      {d12[2], d13[2]}, [r0], r12
+    vst2.8      {d12[3], d13[3]}, [r3], r12
+    vst2.8      {d12[4], d13[4]}, [r0], r12
+    vst2.8      {d12[5], d13[5]}, [r3], r12
+    vst2.8      {d12[6], d13[6]}, [r0], r12
+    vst2.8      {d12[7], d13[7]}, [r3], r12
+    vst2.8      {d14[0], d15[0]}, [r0], r12
+    vst2.8      {d14[1], d15[1]}, [r3], r12
+    vst2.8      {d14[2], d15[2]}, [r0], r12
+    vst2.8      {d14[3], d15[3]}, [r3], r12
+    vst2.8      {d14[4], d15[4]}, [r0], r12
+    vst2.8      {d14[5], d15[5]}, [r3], r12
+    vst2.8      {d14[6], d15[6]}, [r0], r12
+    vst2.8      {d14[7], d15[7]}, [r3]
 
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
 
-;-----------------
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
 
+|vp8_loop_filter_bvs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                   ; load blim from mem
+    mov         r4, r0
+    add         r0, r0, #4
+    vdup.s8     q1, r3                     ; duplicate blim
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    ; vp8_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
+    add         r0, r4, #8
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    add         r0, r4, #12
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
     END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
index 72f0f92..f41c156 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -14,155 +14,143 @@
     EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
     EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
     ARM
-    REQUIRE8
-    PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
 ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const signed char *flimit,
-;                                               const signed char *limit,
-;                                               const signed char *thresh,
-;                                               int count)
+;                                               const unsigned char *blimit,
+;                                               const unsigned char *limit,
+;                                               const unsigned char *thresh)
 ; r0    unsigned char *src,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, r1, lsl #2          ; move src pointer down by 4 lines
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    add         r1, r1, r1                  ; double stride
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
+    vdup.u8     q2, r12                     ; thresh
+    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
 
-    vld1.u8     {q3}, [r0], r1              ; p3
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    vld1.u8     {q4}, [r0], r1              ; p2
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    vld1.u8     {q5}, [r0], r1              ; p1
-    vld1.u8     {q6}, [r0], r1              ; p0
-    vld1.u8     {q7}, [r0], r1              ; q0
-    vld1.u8     {q8}, [r0], r1              ; q1
-    vld1.u8     {q9}, [r0], r1              ; q2
-    vld1.u8     {q10}, [r0], r1             ; q3
+    vld1.u8     {q3}, [r0@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r0@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r0@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r0@128], r1              ; q2
+    vld1.u8     {q10}, [r12@128], r1            ; q3
 
     bl          vp8_mbloop_filter_neon
 
-    sub         r0, r0, r1, lsl #3
-    add         r0, r0, r1
-    add         r2, r0, r1
-    add         r3, r2, r1
+    sub         r12, r12, r1, lsl #2
+    add         r0, r12, r1, lsr #1
 
-    vst1.u8     {q4}, [r0]                  ; store op2
-    vst1.u8     {q5}, [r2]                  ; store op1
-    vst1.u8     {q6}, [r3], r1              ; store op0
-    add         r12, r3, r1
-    vst1.u8     {q7}, [r3]                  ; store oq0
-    vst1.u8     {q8}, [r12], r1             ; store oq1
-    vst1.u8     {q9}, [r12]             ; store oq2
+    vst1.u8     {q4}, [r12@128],r1         ; store op2
+    vst1.u8     {q5}, [r0@128],r1          ; store op1
+    vst1.u8     {q6}, [r12@128], r1        ; store op0
+    vst1.u8     {q7}, [r0@128],r1          ; store oq0
+    vst1.u8     {q8}, [r12@128]            ; store oq1
+    vst1.u8     {q9}, [r0@128]             ; store oq2
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
 
 ; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const signed char *flimit,
-;                                                const signed char *limit,
-;                                                const signed char *thresh,
+;                                                const unsigned char *blimit,
+;                                                const unsigned char *limit,
+;                                                const unsigned char *thresh,
 ;                                                unsigned char *v)
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
+
 |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    ldr         r3, [sp, #8]                ; load v ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
-    sub         r3, r3, r1, lsl #2          ; move v pointer down by 4 lines
+    push        {lr}
+    ldr         r12, [sp, #4]                 ; load thresh
+    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
+    vdup.u8     q2, r12                       ; thresh
+    ldr         r12, [sp, #8]                 ; load v ptr
+    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
 
-    vld1.u8     {d6}, [r0], r1              ; p3
-    vld1.u8     {d7}, [r3], r1              ; p3
-    vld1.u8     {d8}, [r0], r1              ; p2
-    vld1.u8     {d9}, [r3], r1              ; p2
-    vld1.u8     {d10}, [r0], r1             ; p1
-    vld1.u8     {d11}, [r3], r1             ; p1
-    vld1.u8     {d12}, [r0], r1             ; p0
-    vld1.u8     {d13}, [r3], r1             ; p0
-    vld1.u8     {d14}, [r0], r1             ; q0
-    vld1.u8     {d15}, [r3], r1             ; q0
-    vld1.u8     {d16}, [r0], r1             ; q1
-    vld1.u8     {d17}, [r3], r1             ; q1
-    vld1.u8     {d18}, [r0], r1             ; q2
-    vld1.u8     {d19}, [r3], r1             ; q2
-    vld1.u8     {d20}, [r0], r1             ; q3
-    vld1.u8     {d21}, [r3], r1             ; q3
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {d6}, [r0@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1              ; p3
+    vld1.u8     {d8}, [r0@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1              ; p2
+    vld1.u8     {d10}, [r0@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1             ; p1
+    vld1.u8     {d12}, [r0@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1             ; p0
+    vld1.u8     {d14}, [r0@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1             ; q0
+    vld1.u8     {d16}, [r0@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1             ; q1
+    vld1.u8     {d18}, [r0@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1             ; q2
+    vld1.u8     {d20}, [r0@64], r1             ; q3
+    vld1.u8     {d21}, [r12@64], r1             ; q3
 
     bl          vp8_mbloop_filter_neon
 
     sub         r0, r0, r1, lsl #3
-    sub         r3, r3, r1, lsl #3
+    sub         r12, r12, r1, lsl #3
 
     add         r0, r0, r1
-    add         r3, r3, r1
+    add         r12, r12, r1
 
-    vst1.u8     {d8}, [r0], r1              ; store u op2
-    vst1.u8     {d9}, [r3], r1              ; store v op2
-    vst1.u8     {d10}, [r0], r1             ; store u op1
-    vst1.u8     {d11}, [r3], r1             ; store v op1
-    vst1.u8     {d12}, [r0], r1             ; store u op0
-    vst1.u8     {d13}, [r3], r1             ; store v op0
-    vst1.u8     {d14}, [r0], r1             ; store u oq0
-    vst1.u8     {d15}, [r3], r1             ; store v oq0
-    vst1.u8     {d16}, [r0], r1             ; store u oq1
-    vst1.u8     {d17}, [r3], r1             ; store v oq1
-    vst1.u8     {d18}, [r0], r1             ; store u oq2
-    vst1.u8     {d19}, [r3], r1             ; store v oq2
+    vst1.u8     {d8}, [r0@64], r1              ; store u op2
+    vst1.u8     {d9}, [r12@64], r1              ; store v op2
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r12@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r12@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
+    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
+    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
+    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
 
 ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const signed char *flimit,
-;                                             const signed char *limit,
-;                                             const signed char *thresh,
-;                                             int count)
+;                                             const unsigned char *blimit,
+;                                             const unsigned char *limit,
+;                                             const unsigned char *thresh)
 ; r0    unsigned char *src,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    stmdb       sp!, {lr}
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
     sub         r0, r0, #4                  ; move src pointer down by 4 columns
+    vdup.s8     q2, r12                     ; thresh
+    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
 
     vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
     vld1.u8     {d8}, [r0], r1
-    sub         sp, sp, #32
+    vld1.u8     {d9}, [r12], r1
     vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
     vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
     vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
     vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
     vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
     vld1.u8     {d20}, [r0], r1
-
-    vld1.u8     {d7}, [r0], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r0], r1
-    vld1.u8     {d11}, [r0], r1
-    vld1.u8     {d13}, [r0], r1
-    vld1.u8     {d15}, [r0], r1
-    vld1.u8     {d17}, [r0], r1
-    vld1.u8     {d19}, [r0], r1
-    vld1.u8     {d21}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
@@ -180,133 +168,11 @@
     vtrn.8      q7, q8
     vtrn.8      q9, q10
 
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    mov         r12, sp
-    vst1.u8     {q3}, [r12]!
-    vst1.u8     {q10}, [r12]!
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #4
-
-    add         r2, r0, r1
-
-    add         r3, r2, r1
-
-    vld1.u8     {q3}, [sp]!
-    vld1.u8     {q10}, [sp]!
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-    add         r12, r3, r1
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0]
-    vst1.8      {d8}, [r2]
-    vst1.8      {d10}, [r3]
-    vst1.8      {d12}, [r12], r1
-    add         r0, r12, r1
-    vst1.8      {d14}, [r12]
-    vst1.8      {d16}, [r0], r1
-    add         r2, r0, r1
-    vst1.8      {d18}, [r0]
-    vst1.8      {d20}, [r2], r1
-    add         r3, r2, r1
-    vst1.8      {d7}, [r2]
-    vst1.8      {d9}, [r3], r1
-    add         r12, r3, r1
-    vst1.8      {d11}, [r3]
-    vst1.8      {d13}, [r12], r1
-    add         r0, r12, r1
-    vst1.8      {d15}, [r12]
-    vst1.8      {d17}, [r0], r1
-    add         r2, r0, r1
-    vst1.8      {d19}, [r0]
-    vst1.8      {d21}, [r2]
-
-    ldmia       sp!, {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const signed char *flimit,
-;                                              const signed char *limit,
-;                                              const signed char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    ldr         r3, [sp, #8]                ; load v ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
-
-    sub         r3, r3, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r3], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         sp, sp, #32
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    mov         r12, sp
-    vst1.u8     {q3}, [r12]!
-    vst1.u8     {q10}, [r12]!
-
-    bl          vp8_mbloop_filter_neon
-
     sub         r0, r0, r1, lsl #3
-    sub         r3, r3, r1, lsl #3
 
-    vld1.u8     {q3}, [sp]!
-    vld1.u8     {q10}, [sp]!
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
 
     ;transpose to 16x8 matrix
     vtrn.32     q3, q7
@@ -326,23 +192,118 @@
 
     ;store op2, op1, op0, oq0, oq1, oq2
     vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r3], r1
+    vst1.8      {d7}, [r12], r1
     vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r3], r1
+    vst1.8      {d9}, [r12], r1
     vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r3], r1
+    vst1.8      {d11}, [r12], r1
     vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r3], r1
+    vst1.8      {d13}, [r12], r1
     vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r3], r1
+    vst1.8      {d15}, [r12], r1
     vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r3], r1
+    vst1.8      {d17}, [r12], r1
     vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r3], r1
-    vst1.8      {d20}, [r0], r1
-    vst1.8      {d21}, [r3], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
 
-    ldmia       sp!, {pc}
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                              const unsigned char *blimit,
+;                                              const unsigned char *limit,
+;                                              const unsigned char *thresh,
+;                                              unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const signed char *flimit,
+; r3    const signed char *limit,
+; sp    const signed char *thresh,
+; sp+4  unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move u pointer down by 4 columns
+    vdup.u8     q2, r12                     ; thresh
+    ldr         r12, [sp, #8]               ; load v ptr
+    sub         r12, r12, #4                ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r0], r1              ;load u data
+    vld1.u8     {d7}, [r12], r1             ;load v data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
 
 ; void vp8_mbloop_filter_neon()
@@ -350,26 +311,19 @@
 ; functions do the necessary load, transpose (if necessary), preserve (if
 ; necessary) and store.
 
-; TODO:
-; The vertical filter writes p3/q3 back out because two 4 element writes are
-; much simpler than ordering and writing two 3 element sets (or three 2 elements
-; sets, or whichever other combinations are possible).
-; If we can preserve q3 and q10, the vertical filter will be able to avoid
-; storing those values on the stack and reading them back after the filter.
-
 ; r0,r1 PRESERVE
-; r2    flimit
-; r3    PRESERVE
-; q1    limit
+; r2    mblimit
+; r3    limit
+
 ; q2    thresh
-; q3    p3
+; q3    p3 PRESERVE
 ; q4    p2
 ; q5    p1
 ; q6    p0
 ; q7    q0
 ; q8    q1
 ; q9    q2
-; q10   q3
+; q10   q3 PRESERVE
 
 |vp8_mbloop_filter_neon| PROC
 
@@ -378,12 +332,12 @@
     vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
     vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
     vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
     vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
 
     vmax.u8     q11, q11, q12
     vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q0
+    vmax.u8     q1, q1, q0
     vmax.u8     q15, q11, q12
 
     vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
@@ -391,44 +345,46 @@
     ; vp8_hevmask
     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q3
+    vmax.u8     q15, q15, q1
 
-    vld1.s8     {d4[], d5[]}, [r2]          ; flimit
+    vdup.u8     q1, r3                      ; limit
+    vdup.u8     q2, r2                      ; mblimit
 
     vmov.u8     q0, #0x80                   ; 0x80
 
-    vadd.u8     q2, q2, q2                  ; flimit * 2
-    vadd.u8     q2, q2, q1                  ; flimit * 2 +  limit
     vcge.u8     q15, q1, q15
 
     vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
     vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    vqadd.u8    q12, q12, q1                ; a = b + a
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
+    vmov.u16    q11, #3                     ; #3
 
     ; vp8_filter
     ; convert to signed
     veor        q7, q7, q0                  ; qs0
+    vshr.u8     q1, q1, #1                  ; a = a / 2
     veor        q6, q6, q0                  ; ps0
     veor        q5, q5, q0                  ; ps1
+
+    vqadd.u8    q12, q12, q1                ; a = b + a
+
     veor        q8, q8, q0                  ; qs1
     veor        q4, q4, q0                  ; ps2
     veor        q9, q9, q0                  ; qs2
 
     vorr        q14, q13, q14               ; vp8_hevmask
 
+    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
+
     vsubl.s8    q2, d14, d12                ; qs0 - ps0
     vsubl.s8    q13, d15, d13
 
     vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
 
-    vadd.s16    q10, q2, q2                 ; 3 * (qs0 - ps0)
-    vadd.s16    q11, q13, q13
+    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
+
     vand        q15, q15, q12               ; vp8_filter_mask
 
-    vadd.s16    q2, q2, q10
-    vadd.s16    q13, q13, q11
+    vmul.i16    q13, q13, q11
 
     vmov.u8     q12, #3                     ; #3
 
@@ -447,23 +403,19 @@
 
     vand        q13, q1, q14                ; Filter2 &= hev
 
-    vmov.u8     d7, #9                      ; #9
-
     vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
     vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
 
-    vmov.u8     d6, #18                     ; #18
+    vmov        q0, q15
 
     vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
     vshr.s8     q13, q13, #3                ; Filter2 >>= 3
 
-    vmov        q10, q15
+    vmov        q11, q15
     vmov        q12, q15
 
     vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
 
-    vmov.u8     d5, #27                     ; #27
-
     vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
 
     vbic        q1, q1, q14                 ; vp8_filter &= ~hev
@@ -471,35 +423,43 @@
     ; roughly 1/7th difference across boundary
     ; roughly 2/7th difference across boundary
     ; roughly 3/7th difference across boundary
-    vmov        q11, q15
+
+    vmov.u8     d5, #9                      ; #9
+    vmov.u8     d4, #18                     ; #18
+
     vmov        q13, q15
     vmov        q14, q15
 
-    vmlal.s8    q10, d2, d7                 ; Filter2 * 9
-    vmlal.s8    q11, d3, d7
-    vmlal.s8    q12, d2, d6                 ; Filter2 * 18
-    vmlal.s8    q13, d3, d6
-    vmlal.s8    q14, d2, d5                 ; Filter2 * 27
+    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
+    vmlal.s8    q11, d3, d5
+    vmov.u8     d5, #27                     ; #27
+    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
+    vmlal.s8    q13, d3, d4
+    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
     vmlal.s8    q15, d3, d5
-    vqshrn.s16  d20, q10, #7                ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d21, q11, #7
+
+    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d1, q11, #7
     vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
     vqshrn.s16  d25, q13, #7
     vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
     vqshrn.s16  d29, q15, #7
 
-    vqsub.s8    q11, q9, q10                ; s = clamp(qs2 - u)
-    vqadd.s8    q10, q4, q10                ; s = clamp(ps2 + u)
+    vmov.u8     q1, #0x80                   ; 0x80
+
+    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
+    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
     vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
     vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
     vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
     vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-    veor        q9, q11, q0                 ; *oq2 = s^0x80
-    veor        q4, q10, q0                 ; *op2 = s^0x80
-    veor        q8, q13, q0                 ; *oq1 = s^0x80
-    veor        q5, q12, q0                 ; *op2 = s^0x80
-    veor        q7, q15, q0                 ; *oq0 = s^0x80
-    veor        q6, q14, q0                 ; *op0 = s^0x80
+
+    veor        q9, q11, q1                 ; *oq2 = s^0x80
+    veor        q4, q0, q1                  ; *op2 = s^0x80
+    veor        q8, q13, q1                 ; *oq1 = s^0x80
+    veor        q5, q12, q1                 ; *op2 = s^0x80
+    veor        q7, q15, q1                 ; *oq0 = s^0x80
+    veor        q6, q14, q1                 ; *op0 = s^0x80
 
     bx          lr
     ENDP        ; |vp8_mbloop_filter_neon|
diff --git a/vp8/common/extend.c b/vp8/common/extend.c
index 036bafc..a2d3253 100644
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -85,10 +85,10 @@
                           src->y_height, src->y_width,
                           et, el, eb, er);
 
-    et = (et + 1) >> 1;
-    el = (el + 1) >> 1;
-    eb = (eb + 1) >> 1;
-    er = (er + 1) >> 1;
+    et = dst->border >> 1;
+    el = dst->border >> 1;
+    eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+    er = (dst->border >> 1) + dst->uv_width - src->uv_width;
 
     copy_and_extend_plane(src->u_buffer, src->uv_stride,
                           dst->u_buffer, dst->uv_stride,
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 1339380..c616294 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -108,9 +108,9 @@
     rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_c;
     rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
     rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_c;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
+    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c;
     rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_c;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
+    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c;
     rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
 
 #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index a324271..be3f535 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -9,152 +9,149 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "loopfilter.h"
 #include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
 
 typedef unsigned char uc;
 
-
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
+
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
 
 /* Horizontal MB filtering */
-void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
-}
-
-void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Vertical MB Filtering */
-void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
-}
-
-void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Horizontal B Filtering */
-void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 /* Vertical B Filtering */
-void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
 }
 
-void vp8_init_loop_filter(VP8_COMMON *cm)
+static void lf_init_lut(loop_filter_info_n *lfi)
 {
-    loop_filter_info *lfi = cm->lf_info;
-    LOOPFILTERTYPE lft = cm->filter_type;
-    int sharpness_lvl = cm->sharpness_level;
-    int frame_type = cm->frame_type;
-    int i, j;
+    int filt_lvl;
 
-    int block_inside_limit = 0;
-    int HEVThresh;
-
-    /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
-    for (i = 0; i <= MAX_LOOP_FILTER; i++)
+    for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
     {
-        int filt_lvl = i;
-
-        if (frame_type == KEY_FRAME)
+        if (filt_lvl >= 40)
         {
-            if (filt_lvl >= 40)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+        }
+        else if (filt_lvl >= 20)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+        }
+        else if (filt_lvl >= 15)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
         }
         else
         {
-            if (filt_lvl >= 40)
-                HEVThresh = 3;
-            else if (filt_lvl >= 20)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
         }
+    }
+
+    lfi->mode_lf_lut[DC_PRED] = 1;
+    lfi->mode_lf_lut[V_PRED] = 1;
+    lfi->mode_lf_lut[H_PRED] = 1;
+    lfi->mode_lf_lut[TM_PRED] = 1;
+    lfi->mode_lf_lut[B_PRED]  = 0;
+
+    lfi->mode_lf_lut[ZEROMV]  = 1;
+    lfi->mode_lf_lut[NEARESTMV] = 2;
+    lfi->mode_lf_lut[NEARMV] = 2;
+    lfi->mode_lf_lut[NEWMV] = 2;
+    lfi->mode_lf_lut[SPLITMV] = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl)
+{
+    int i;
+
+    /* For each possible value for the loop filter fill out limits */
+    for (i = 0; i <= MAX_LOOP_FILTER; i++)
+    {
+        int filt_lvl = i;
+        int block_inside_limit = 0;
 
         /* Set loop filter paramaeters that control sharpness. */
         block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
@@ -169,119 +166,120 @@
         if (block_inside_limit < 1)
             block_inside_limit = 1;
 
-        for (j = 0; j < 16; j++)
-        {
-            lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl + 2;
-            lfi[i].flim[j] = filt_lvl;
-            lfi[i].thr[j] = HEVThresh;
-        }
-
-    }
-
-    /* Set up the function pointers depending on the type of loop filtering selected */
-    if (lft == NORMAL_LOOPFILTER)
-    {
-        cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
-        cm->lf_bv  = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v);
-        cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h);
-        cm->lf_bh  = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h);
-    }
-    else
-    {
-        cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v);
-        cm->lf_bv  = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v);
-        cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h);
-        cm->lf_bh  = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h);
+        vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+        vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+                SIMD_WIDTH);
+        vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+                SIMD_WIDTH);
     }
 }
 
-/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
- * each frame. Check last_frame_type to skip the function most of times.
- */
-void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
+void vp8_loop_filter_init(VP8_COMMON *cm)
 {
-    int HEVThresh;
-    int i, j;
+    loop_filter_info_n *lfi = &cm->lf_info;
+    int i;
 
-    /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
-    for (i = 0; i <= MAX_LOOP_FILTER; i++)
+    /* init limits for given sharpness*/
+    vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    /* init LUT for lvl  and hev thr picking */
+    lf_init_lut(lfi);
+
+    /* init hev threshold const vectors */
+    for(i = 0; i < 4 ; i++)
     {
-        int filt_lvl = i;
-
-        if (frame_type == KEY_FRAME)
-        {
-            if (filt_lvl >= 40)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
-        }
-        else
-        {
-            if (filt_lvl >= 40)
-                HEVThresh = 3;
-            else if (filt_lvl >= 20)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
-        }
-
-        for (j = 0; j < 16; j++)
-        {
-            /*lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl+2;*/
-            /*lfi[i].flim[j] = filt_lvl;*/
-            lfi[i].thr[j] = HEVThresh;
-        }
+        vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
     }
 }
 
-
-int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level)
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+                                MACROBLOCKD *mbd,
+                                int default_filt_lvl,
+                                int sharpness_lvl)
 {
-    MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
+    int seg,  /* segment number */
+        ref,  /* index in ref_lf_deltas */
+        mode; /* index in mode_lf_deltas */
 
-    if (mbd->mode_ref_lf_delta_enabled)
+    loop_filter_info_n *lfi = &cm->lf_info;
+
+    /* update limits if sharpness has changed */
+    if(cm->last_sharpness_level != sharpness_lvl)
     {
+        vp8_loop_filter_update_sharpness(lfi, sharpness_lvl);
+        cm->last_sharpness_level = sharpness_lvl;
+    }
+
+    for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+    {
+        int lvl_seg = default_filt_lvl;
+        int lvl_ref, lvl_mode;
+
+        /* Note the baseline filter values for each segment */
+        if (mbd->segmentation_enabled)
+        {
+            /* Abs value */
+            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            {
+                lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+            }
+            else  /* Delta Value */
+            {
+                lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+                lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
+            }
+        }
+
+        if (!mbd->mode_ref_lf_delta_enabled)
+        {
+            /* we could get rid of this if we assume that deltas are set to
+             * zero when not in use; encoder always uses deltas
+             */
+            vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+            continue;
+        }
+
+        lvl_ref = lvl_seg;
+
+        /* INTRA_FRAME */
+        ref = INTRA_FRAME;
+
         /* Apply delta for reference frame */
-        filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
+        lvl_ref += mbd->ref_lf_deltas[ref];
 
-        /* Apply delta for mode */
-        if (mbmi->ref_frame == INTRA_FRAME)
+        /* Apply delta for Intra modes */
+        mode = 0; /* B_PRED */
+        /* Only the split mode BPRED has a further special case */
+        lvl_mode = lvl_ref +  mbd->mode_lf_deltas[mode];
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        mode = 1; /* all the rest of Intra modes */
+        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        /* LAST, GOLDEN, ALT */
+        for(ref = 1; ref < MAX_REF_FRAMES; ref++)
         {
-            /* Only the split mode BPRED has a further special case */
-            if (mbmi->mode == B_PRED)
-                filter_level +=  mbd->mode_lf_deltas[0];
+            int lvl_ref = lvl_seg;
+
+            /* Apply delta for reference frame */
+            lvl_ref += mbd->ref_lf_deltas[ref];
+
+            /* Apply delta for Inter modes */
+            for (mode = 1; mode < 4; mode++)
+            {
+                lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+                lfi->lvl[seg][ref][mode] = lvl_mode;
+            }
         }
-        else
-        {
-            /* Zero motion mode */
-            if (mbmi->mode == ZEROMV)
-                filter_level +=  mbd->mode_lf_deltas[1];
-
-            /* Split MB motion mode */
-            else if (mbmi->mode == SPLITMV)
-                filter_level +=  mbd->mode_lf_deltas[3];
-
-            /* All other inter motion modes (Nearest, Near, New) */
-            else
-                filter_level +=  mbd->mode_lf_deltas[2];
-        }
-
-        /* Range check */
-        if (filter_level > MAX_LOOP_FILTER)
-            filter_level = MAX_LOOP_FILTER;
-        else if (filter_level < 0)
-            filter_level = 0;
     }
-    return filter_level;
 }
 
-
 void vp8_loop_filter_frame
 (
     VP8_COMMON *cm,
@@ -290,49 +288,23 @@
 )
 {
     YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-    loop_filter_info *lfi = cm->lf_info;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
     FRAME_TYPE frame_type = cm->frame_type;
 
     int mb_row;
     int mb_col;
 
-
-    int baseline_filter_level[MAX_MB_SEGMENTS];
     int filter_level;
-    int alt_flt_enabled = mbd->segmentation_enabled;
 
-    int i;
     unsigned char *y_ptr, *u_ptr, *v_ptr;
 
-    mbd->mode_info_context = cm->mi;          /* Point at base of Mb MODE_INFO list */
-
-    /* Note the baseline filter values for each segment */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            /* Delta Value */
-            else
-            {
-                baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
-            }
-        }
-    }
-    else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            baseline_filter_level[i] = default_filt_lvl;
-    }
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;
 
     /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, cm->sharpness_level);
 
     /* Set up the buffer pointers */
     y_ptr = post->y_buffer;
@@ -344,51 +316,79 @@
     {
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
-            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
-            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
-                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
-                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                            mode_info_context->mbmi.mode != SPLITMV &&
+                            mode_info_context->mbmi.mb_skip_coeff);
 
-            filter_level = baseline_filter_level[Segment];
+            const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
 
-            /* Distance of Mb to the various image edges.
-             * These specified to 8th pel as they are always compared to values that are in 1/8th pel units
-             * Apply any context driven MB level adjustment
-             */
-            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
 
             if (filter_level)
             {
-                if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-                if (!skip_lf)
-                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
 
-                /* don't apply across umv border */
-                if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
 
-                if (!skip_lf)
-                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
             }
 
             y_ptr += 16;
             u_ptr += 8;
             v_ptr += 8;
 
-            mbd->mode_info_context++;     /* step to next MB */
+            mode_info_context++;     /* step to next MB */
         }
 
         y_ptr += post->y_stride  * 16 - post->y_width;
         u_ptr += post->uv_stride *  8 - post->uv_width;
         v_ptr += post->uv_stride *  8 - post->uv_width;
 
-        mbd->mode_info_context++;         /* Skip border mb */
+        mode_info_context++;         /* Skip border mb */
     }
 }
 
-
 void vp8_loop_filter_frame_yonly
 (
     VP8_COMMON *cm,
@@ -399,49 +399,28 @@
 {
     YV12_BUFFER_CONFIG *post = cm->frame_to_show;
 
-    int i;
     unsigned char *y_ptr;
     int mb_row;
     int mb_col;
 
-    loop_filter_info *lfi = cm->lf_info;
-    int baseline_filter_level[MAX_MB_SEGMENTS];
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
     int filter_level;
-    int alt_flt_enabled = mbd->segmentation_enabled;
     FRAME_TYPE frame_type = cm->frame_type;
 
-    (void) sharpness_lvl;
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;
 
-    /*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */
-    mbd->mode_info_context = cm->mi;          /* Point at base of Mb MODE_INFO list */
+    sharpness_lvl = cm->sharpness_level;
 
-    /* Note the baseline filter values for each segment */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            /* Delta Value */
-            else
-            {
-                baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
-            }
-        }
-    }
-    else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            baseline_filter_level[i] = default_filt_lvl;
-    }
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
 
     /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, sharpness_lvl);
 
     /* Set up the buffer pointers */
     y_ptr = post->y_buffer;
@@ -451,44 +430,75 @@
     {
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
-            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
-            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
-                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
-                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                            mode_info_context->mbmi.mode != SPLITMV &&
+                            mode_info_context->mbmi.mb_skip_coeff);
 
-            filter_level = baseline_filter_level[Segment];
+            const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
 
-            /* Apply any context driven MB level adjustment */
-            filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
 
             if (filter_level)
             {
-                if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-                if (!skip_lf)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
 
-                /* don't apply across umv border */
-                if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
 
-                if (!skip_lf)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
             }
 
             y_ptr += 16;
-            mbd->mode_info_context ++;        /* step to next MB */
+            mode_info_context ++;        /* step to next MB */
 
         }
 
         y_ptr += post->y_stride  * 16 - post->y_width;
-        mbd->mode_info_context ++;            /* Skip border mb */
+        mode_info_context ++;            /* Skip border mb */
     }
 
 }
 
-
 void vp8_loop_filter_partial_frame
 (
     VP8_COMMON *cm,
@@ -500,25 +510,32 @@
 {
     YV12_BUFFER_CONFIG *post = cm->frame_to_show;
 
-    int i;
     unsigned char *y_ptr;
     int mb_row;
     int mb_col;
-    /*int mb_rows = post->y_height >> 4;*/
     int mb_cols = post->y_width  >> 4;
 
-    int linestocopy;
+    int linestocopy, i;
 
-    loop_filter_info *lfi = cm->lf_info;
-    int baseline_filter_level[MAX_MB_SEGMENTS];
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
     int filter_level;
     int alt_flt_enabled = mbd->segmentation_enabled;
     FRAME_TYPE frame_type = cm->frame_type;
 
-    (void) sharpness_lvl;
+    const MODE_INFO *mode_info_context;
 
-    /*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */
-    mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);        /* Point at base of Mb MODE_INFO list */
+    int lvl_seg[MAX_MB_SEGMENTS];
+
+    sharpness_lvl = cm->sharpness_level;
+
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
 
     linestocopy = (post->y_height >> (4 + Fraction));
 
@@ -531,29 +548,24 @@
     if (alt_flt_enabled)
     {
         for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
+        {    /* Abs value */
             if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+            {
+                lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+            }
             /* Delta Value */
             else
             {
-                baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
+                lvl_seg[i] = default_filt_lvl
+                        + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+                lvl_seg[i] = (lvl_seg[i] > 0) ?
+                        ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
             }
         }
     }
     else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            baseline_filter_level[i] = default_filt_lvl;
-    }
+        lvl_seg[0] = default_filt_lvl;
 
-    /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);
 
     /* Set up the buffer pointers */
     y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
@@ -563,32 +575,64 @@
     {
         for (mb_col = 0; mb_col < mb_cols; mb_col++)
         {
-            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
-            int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
-                            mbd->mode_info_context->mbmi.mode != SPLITMV &&
-                            mbd->mode_info_context->mbmi.mb_skip_coeff);
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                           mode_info_context->mbmi.mode != SPLITMV &&
+                           mode_info_context->mbmi.mb_skip_coeff);
 
-            filter_level = baseline_filter_level[Segment];
+            if (alt_flt_enabled)
+                filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+            else
+                filter_level = lvl_seg[0];
 
             if (filter_level)
             {
-                if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-                if (!skip_lf)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
 
-                cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
 
-                if (!skip_lf)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
+                    LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
             }
 
             y_ptr += 16;
-            mbd->mode_info_context += 1;      /* step to next MB */
+            mode_info_context += 1;      /* step to next MB */
         }
 
         y_ptr += post->y_stride  * 16 - post->y_width;
-        mbd->mode_info_context += 1;          /* Skip border mb */
+        mode_info_context += 1;          /* Skip border mb */
     }
 }
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index ca136b3..2d6dad3 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -13,6 +13,7 @@
 #define loopfilter_h
 
 #include "vpx_ports/mem.h"
+#include "vpx_config.h"
 
 #define MAX_LOOP_FILTER 63
 
@@ -22,27 +23,46 @@
     SIMPLE_LOOPFILTER = 1
 } LOOPFILTERTYPE;
 
-/* FRK
- * Need to align this structure so when it is declared and
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
  * passed it can be loaded into vector registers.
  */
 typedef struct
 {
-    DECLARE_ALIGNED(16, signed char, lim[16]);
-    DECLARE_ALIGNED(16, signed char, flim[16]);
-    DECLARE_ALIGNED(16, signed char, thr[16]);
-    DECLARE_ALIGNED(16, signed char, mbflim[16]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+    unsigned char lvl[4][4][4];
+    unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+    unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct
+{
+    const unsigned char * mblim;
+    const unsigned char * blim;
+    const unsigned char * lim;
+    const unsigned char * hev_thr;
 } loop_filter_info;
 
 
 #define prototype_loopfilter(sym) \
-    void sym(unsigned char *src, int pitch, const signed char *flimit,\
-             const signed char *limit, const signed char *thresh, int count)
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
 
 #define prototype_loopfilter_block(sym) \
-    void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
+    void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
              int ystride, int uv_stride, loop_filter_info *lfi)
 
+#define prototype_simple_loopfilter(sym) \
+    void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/loopfilter_x86.h"
 #endif
@@ -71,38 +91,39 @@
 #endif
 extern prototype_loopfilter_block(vp8_lf_normal_b_h);
 
-
 #ifndef vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_mb_v);
+extern prototype_simple_loopfilter(vp8_lf_simple_mb_v);
 
 #ifndef vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_b_v);
+extern prototype_simple_loopfilter(vp8_lf_simple_b_v);
 
 #ifndef vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_mb_h);
+extern prototype_simple_loopfilter(vp8_lf_simple_mb_h);
 
 #ifndef vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_b_h);
+extern prototype_simple_loopfilter(vp8_lf_simple_b_h);
 
 typedef prototype_loopfilter_block((*vp8_lf_block_fn_t));
+typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t));
+
 typedef struct
 {
     vp8_lf_block_fn_t  normal_mb_v;
     vp8_lf_block_fn_t  normal_b_v;
     vp8_lf_block_fn_t  normal_mb_h;
     vp8_lf_block_fn_t  normal_b_h;
-    vp8_lf_block_fn_t  simple_mb_v;
-    vp8_lf_block_fn_t  simple_b_v;
-    vp8_lf_block_fn_t  simple_mb_h;
-    vp8_lf_block_fn_t  simple_b_h;
+    vp8_slf_block_fn_t  simple_mb_v;
+    vp8_slf_block_fn_t  simple_b_v;
+    vp8_slf_block_fn_t  simple_mb_h;
+    vp8_slf_block_fn_t  simple_b_h;
 } vp8_loopfilter_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -115,9 +136,9 @@
 (
     unsigned char *u,   /* source pointer */
     int p,              /* pitch */
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
     unsigned char *v
 );
 
diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c
index 6940529..10228ae 100644
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -24,8 +24,9 @@
 
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
-                                     uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
+static __inline signed char vp8_filter_mask(uc limit, uc blimit,
+                                     uc p3, uc p2, uc p1, uc p0,
+                                     uc q0, uc q1, uc q2, uc q3)
 {
     signed char mask = 0;
     mask |= (abs(p3 - p2) > limit) * -1;
@@ -34,13 +35,13 @@
     mask |= (abs(q1 - q0) > limit) * -1;
     mask |= (abs(q2 - q1) > limit) * -1;
     mask |= (abs(q3 - q2) > limit) * -1;
-    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit) * -1;
+    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = ~mask;
     return mask;
 }
 
 /* is there high variance internal edge ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
+static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
 {
     signed char hev = 0;
     hev  |= (abs(p1 - p0) > thresh) * -1;
@@ -48,7 +49,8 @@
     return hev;
 }
 
-static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
+static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
+        uc *op0, uc *oq0, uc *oq1)
 
 {
     signed char ps0, qs0;
@@ -98,9 +100,9 @@
 (
     unsigned char *s,
     int p, /* pitch */
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
     int count
 )
 {
@@ -113,11 +115,11 @@
      */
     do
     {
-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                                s[-4*p], s[-3*p], s[-2*p], s[-1*p],
                                s[0*p], s[1*p], s[2*p], s[3*p]);
 
-        hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
 
         vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
 
@@ -130,9 +132,9 @@
 (
     unsigned char *s,
     int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
     int count
 )
 {
@@ -145,10 +147,10 @@
      */
     do
     {
-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                                s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
 
-        hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+        hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
 
         vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
 
@@ -157,7 +159,7 @@
     while (++i < count * 8);
 }
 
-static __inline void vp8_mbfilter(signed char mask, signed char hev,
+static __inline void vp8_mbfilter(signed char mask, uc hev,
                            uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
     signed char s, u;
@@ -216,9 +218,9 @@
 (
     unsigned char *s,
     int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
     int count
 )
 {
@@ -232,11 +234,11 @@
     do
     {
 
-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                                s[-4*p], s[-3*p], s[-2*p], s[-1*p],
                                s[0*p], s[1*p], s[2*p], s[3*p]);
 
-        hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
 
         vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
 
@@ -251,9 +253,9 @@
 (
     unsigned char *s,
     int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
     int count
 )
 {
@@ -264,10 +266,10 @@
     do
     {
 
-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                                s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
 
-        hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+        hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
 
         vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
 
@@ -278,13 +280,13 @@
 }
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
+static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
 {
 /* Why does this cause problems for win32?
  * error C2143: syntax error : missing ';' before 'type'
  *  (void) limit;
  */
-    signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= flimit * 2 + limit) * -1;
+    signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
     return mask;
 }
 
@@ -317,47 +319,37 @@
 (
     unsigned char *s,
     int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
-    int count
+    const unsigned char *blimit
 )
 {
     signed char mask = 0;
     int i = 0;
-    (void) thresh;
 
     do
     {
-        /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/
-        mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
         vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
         ++s;
     }
-    while (++i < count * 8);
+    while (++i < 16);
 }
 
 void vp8_loop_filter_simple_vertical_edge_c
 (
     unsigned char *s,
     int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
-    int count
+    const unsigned char *blimit
 )
 {
     signed char mask = 0;
     int i = 0;
-    (void) thresh;
 
     do
     {
-        /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/
-        mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
+        mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
         vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
         s += p;
     }
-    while (++i < count * 8);
+    while (++i < 16);
 
 }
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index a381dfe..4356b51 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -83,6 +83,7 @@
 } VP8_COMMON_RTCD;
 
 typedef struct VP8Common
+
 {
     struct vpx_internal_error_info  error;
 
@@ -107,7 +108,8 @@
     YV12_BUFFER_CONFIG post_proc_buffer;
     YV12_BUFFER_CONFIG temp_scale_frame;
 
-    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for loopfilter init checking and motion search. */
+
+    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
     FRAME_TYPE frame_type;
 
     int show_frame;
@@ -148,11 +150,9 @@
     INTERPOLATIONFILTERTYPE mcomp_filter_type;
     LOOPFILTERTYPE last_filter_type;
     LOOPFILTERTYPE filter_type;
-    loop_filter_info lf_info[MAX_LOOP_FILTER+1];
-    prototype_loopfilter_block((*lf_mbv));
-    prototype_loopfilter_block((*lf_mbh));
-    prototype_loopfilter_block((*lf_bv));
-    prototype_loopfilter_block((*lf_bh));
+
+    loop_filter_info_n lf_info;
+
     int filter_level;
     int last_sharpness_level;
     int sharpness_level;
@@ -205,10 +205,9 @@
     struct postproc_state  postproc_state;
 } VP8_COMMON;
 
-
-int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level);
-void vp8_init_loop_filter(VP8_COMMON *cm);
-void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
-extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
+void vp8_loop_filter_init(VP8_COMMON *cm);
+void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd,
+                                int default_filt_lvl, int sharpness_lvl);
+void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
 
 #endif
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index c6c215c..ad47284 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -16,7 +16,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int  count
@@ -122,12 +122,10 @@
         paddusb     mm5, mm5              ; abs(p0-q0)*2
         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx, arg(2) ;flimit           ; get flimit
-        movq        mm2, [rdx]            ; flimit mm2
-        paddb       mm2, mm2              ; flimit*2 (less than 255)
-        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
 
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,    mm5
         pxor        mm5,    mm5
         pcmpeqb     mm1,    mm5           ; mask mm1
@@ -230,7 +228,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int count
@@ -406,9 +404,9 @@
         pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
         psrlw       mm5,        1                           ; abs(p1-q1)/2
 
-        mov         rdx,        arg(2) ;flimit                      ;
+        mov         rdx,        arg(2) ;blimit                      ;
 
-        movq        mm2,        [rdx]                       ;flimit  mm2
+        movq        mm4,        [rdx]                       ;blimit
         movq        mm1,        mm3                         ; mm1=mm3=p0
 
         movq        mm7,        mm6                         ; mm7=mm6=q0
@@ -419,10 +417,7 @@
         paddusb     mm1,        mm1                         ; abs(q0-p0)*2
         paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
-        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,        mm0;                        ; mask
 
         pxor        mm0,        mm0
@@ -603,7 +598,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int count
@@ -719,17 +714,15 @@
         paddusb     mm5, mm5              ; abs(p0-q0)*2
         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx, arg(2) ;flimit           ; get flimit
-        movq        mm2, [rdx]            ; flimit mm2
-        paddb       mm2, mm2              ; flimit*2 (less than 255)
-        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
 
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,    mm5
         pxor        mm5,    mm5
         pcmpeqb     mm1,    mm5           ; mask mm1
 
-        ; mm1 = mask, mm0=q0,  mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
         ; mm6 = p0,
 
         ; calculate high edge variance
@@ -922,7 +915,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
+;    const char *blimit,
 ;    const char *limit,
 ;    const char *thresh,
 ;    int count
@@ -1108,9 +1101,9 @@
         pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
         psrlw       mm5,        1                           ; abs(p1-q1)/2
 
-        mov         rdx,        arg(2) ;flimit                      ;
+        mov         rdx,        arg(2) ;blimit                      ;
 
-        movq        mm2,        [rdx]                       ;flimit  mm2
+        movq        mm4,        [rdx]                       ;blimit
         movq        mm1,        mm3                         ; mm1=mm3=p0
 
         movq        mm7,        mm6                         ; mm7=mm6=q0
@@ -1121,10 +1114,7 @@
         paddusb     mm1,        mm1                         ; abs(q0-p0)*2
         paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
-        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         mm1,        mm0;                        ; mask
 
         pxor        mm0,        mm0
@@ -1392,16 +1382,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit
 ;)
 global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
 sym(vp8_loop_filter_simple_horizontal_edge_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1410,14 +1397,10 @@
         mov         rsi, arg(0) ;src_ptr
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
-        movsxd      rcx, dword ptr arg(5) ;count
+        mov         rcx, 2                ; count
 nexts8_h:
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdx, arg(2) ;flimit           ; get flimit
+        mov         rdx, arg(2) ;blimit           ; get blimit
         movq        mm3, [rdx]            ;
-        paddb       mm3, mm3              ; flimit*2 (less than 255)
-        paddb       mm3, mm7              ; flimit * 2 + limit (less than 255)
 
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
         add         rdi, rax
@@ -1445,7 +1428,7 @@
         paddusb     mm5, mm5              ; abs(p0-q0)*2
         paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        mm3, mm3
         pcmpeqb     mm5, mm3
 
@@ -1515,16 +1498,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit
 ;)
 global sym(vp8_loop_filter_simple_vertical_edge_mmx)
 sym(vp8_loop_filter_simple_vertical_edge_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -1539,7 +1519,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         lea         rsi, [rsi + rax*4- 2];  ;
-        movsxd      rcx, dword ptr arg(5) ;count
+        mov         rcx, 2                                      ; count
 nexts8_v:
 
         lea         rdi,        [rsi + rax];
@@ -1602,14 +1582,10 @@
         paddusb     mm5,        mm5                             ; abs(p0-q0)*2
         paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx,        arg(2) ;flimit                          ; get flimit
+        mov         rdx,        arg(2) ;blimit                          ; get blimit
         movq        mm7,        [rdx]
-        mov         rdx,        arg(3)                          ; get limit
-        movq        mm6,        [rdx]
-        paddb       mm7,        mm7                             ; flimit*2 (less than 255)
-        paddb       mm7,        mm6                             ; flimit * 2 + limit (less than 255)
 
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        mm7,        mm7
         pcmpeqb     mm5,        mm7                             ; mm5 = mask
 
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index c2ce1a1..4efff7e 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -110,7 +110,7 @@
         psubusb     xmm6,                   xmm5              ; p1-=p0
 
         por         xmm6,                   xmm4              ; abs(p1 - p0)
-        mov         rdx,                    arg(2)            ; get flimit
+        mov         rdx,                    arg(2)            ; get blimit
 
         movdqa        t1,                   xmm6              ; save to t1
 
@@ -123,7 +123,7 @@
         psubusb     xmm1,                   xmm7
         por         xmm2,                   xmm3              ; abs(p1-q1)
 
-        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit
+        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
 
         movdqa      xmm3,                   xmm0              ; q0
         pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
@@ -134,13 +134,11 @@
         psrlw       xmm2,                   1                 ; abs(p1-q1)/2
 
         psubusb     xmm5,                   xmm3              ; p0-=q0
-        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)
 
         psubusb     xmm3,                   xmm6              ; q0-=p0
         por         xmm5,                   xmm3              ; abs(p0 - q0)
 
         paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
-        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)
 
         movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
 
@@ -150,7 +148,7 @@
 
         movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
 
-        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         psubusb     xmm4,                   xmm2              ; hev
 
         psubusb     xmm3,                   xmm2              ; hev
@@ -278,7 +276,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -328,7 +326,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -574,7 +572,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -624,7 +622,7 @@
 ;(
 ;    unsigned char *u,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    unsigned char *v
@@ -904,7 +902,7 @@
         movdqa      xmm4,               XMMWORD PTR [rdx]; limit
 
         pmaxub      xmm0,               xmm7
-        mov         rdx,                arg(2)          ; flimit
+        mov         rdx,                arg(2)          ; blimit
 
         psubusb     xmm0,               xmm4
         movdqa      xmm5,               xmm2            ; q1
@@ -921,12 +919,11 @@
         psrlw       xmm5,               1               ; abs(p1-q1)/2
         psubusb     xmm6,               xmm3            ; q0-p0
 
-        movdqa      xmm2,               XMMWORD PTR [rdx]; flimit
+        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
 
         mov         rdx,                arg(4)          ; get thresh
 
         por         xmm1,               xmm6            ; abs(q0-p0)
-        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)
 
         movdqa      xmm6,               t0              ; get abs (q1 - q0)
 
@@ -939,10 +936,9 @@
         paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
         psubusb     xmm6,               xmm7            ; abs(q1 - q0) > thresh
 
-        paddb       xmm4,               xmm2            ; flimit * 2 + limit (less than 255)
         psubusb     xmm3,               xmm7            ; abs(p1 - p0)> thresh
 
-        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
         por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
 
         por         xmm1,               xmm0            ; mask
@@ -1014,7 +1010,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -1081,7 +1077,7 @@
 ;(
 ;    unsigned char *u,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    unsigned char *v
@@ -1239,7 +1235,7 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    int            count
@@ -1308,7 +1304,7 @@
 ;(
 ;    unsigned char *u,
 ;    int            src_pixel_step,
-;    const char    *flimit,
+;    const char    *blimit,
 ;    const char    *limit,
 ;    const char    *thresh,
 ;    unsigned char *v
@@ -1376,16 +1372,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit,
 ;)
 global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
 sym(vp8_loop_filter_simple_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
@@ -1394,13 +1387,8 @@
 
         mov         rsi, arg(0)             ;src_ptr
         movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2) ;flimit     ; get flimit
+        mov         rdx, arg(2)             ;blimit
         movdqa      xmm3, XMMWORD PTR [rdx]
-        mov         rdx, arg(3) ;limit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        paddb       xmm3, xmm3              ; flimit*2 (less than 255)
-        paddb       xmm3, xmm7              ; flimit * 2 + limit (less than 255)
 
         mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
         add         rdi, rax
@@ -1428,7 +1416,7 @@
         paddusb     xmm5, xmm5              ; abs(p0-q0)*2
         paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        xmm3, xmm3
         pcmpeqb     xmm5, xmm3
 
@@ -1493,16 +1481,13 @@
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
-;    const char *flimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *blimit,
 ;)
 global sym(vp8_loop_filter_simple_vertical_edge_sse2)
 sym(vp8_loop_filter_simple_vertical_edge_sse2):
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 3
     SAVE_XMM 7
     GET_GOT     rbx         ; save callee-saved reg
     push        rsi
@@ -1607,14 +1592,10 @@
         paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
         paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
 
-        mov         rdx,        arg(2)                          ;flimit
+        mov         rdx,        arg(2)                          ;blimit
         movdqa      xmm7, XMMWORD PTR [rdx]
-        mov         rdx,        arg(3)                          ; get limit
-        movdqa      xmm6, XMMWORD PTR [rdx]
-        paddb       xmm7,        xmm7                           ; flimit*2 (less than 255)
-        paddb       xmm7,        xmm6                           ; flimit * 2 + limit (less than 255)
 
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
+        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
         pxor        xmm7,        xmm7
         pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
 
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index a52420c..9360ac1 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -9,30 +9,18 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "vp8/common/loopfilter.h"
 
-prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
-
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
 
 prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
-prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
 
 extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
 extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
@@ -44,23 +32,13 @@
 void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
-}
-
-
-void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
@@ -68,23 +46,13 @@
 void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
-}
-
-
-void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
@@ -92,27 +60,23 @@
 void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
-void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 
@@ -120,27 +84,23 @@
 void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
 
-void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
 }
 #endif
 
@@ -150,20 +110,10 @@
 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
-}
-
-
-void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
 }
 
 
@@ -171,20 +121,10 @@
 void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
-}
-
-
-void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
 }
 
 
@@ -192,24 +132,20 @@
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
 }
 
 
-void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 
@@ -217,36 +153,20 @@
 void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
 }
 
 
-void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
 }
 
 #endif
-
-#if 0
-void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr,
-        int y_stride,
-        loop_filter_info *lfi)
-{
-
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-}
-#endif
diff --git a/vp8/common/x86/loopfilter_x86.h b/vp8/common/x86/loopfilter_x86.h
index 80dbebc..1ed6c21 100644
--- a/vp8/common/x86/loopfilter_x86.h
+++ b/vp8/common/x86/loopfilter_x86.h
@@ -24,10 +24,10 @@
 extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx);
 
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -44,13 +44,13 @@
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx
@@ -63,10 +63,10 @@
 extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2);
 
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -83,13 +83,13 @@
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index 87374f3..33a984b 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "vpx_ports/x86.h"
 #include "vp8/common/g_common.h"
 #include "vp8/common/subpixel.h"
@@ -63,9 +63,9 @@
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_mmx;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_mmx;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx;
+        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_mmx;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx;
+        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_mmx;
 
 #if CONFIG_POSTPROC
@@ -101,9 +101,9 @@
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_sse2;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_sse2;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2;
+        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_sse2;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2;
+        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_sse2;
 
 #if CONFIG_POSTPROC
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 01d9402..0a7942d 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -180,11 +180,11 @@
     return (MB_PREDICTION_MODE)i;
 }
 
-static MB_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p)
+static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p)
 {
     const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
 
-    return (MB_PREDICTION_MODE)i;
+    return (B_PREDICTION_MODE)i;
 }
 
 #ifdef VPX_MODE_COUNT
@@ -334,7 +334,7 @@
                 abovemv.as_int = above_block_mv(mi, k, mis);
                 mv_contz = vp8_mv_cont(&leftmv, &abovemv);
 
-                switch ((B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/
+                switch (sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/
                 {
                 case NEW4X4:
                     read_mv(bc, &blockmv.as_mv, (const MV_CONTEXT *) mvc);
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 5f81ee6..aeb1607 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -95,7 +95,7 @@
     {
         VP8_COMMON *cm = &pbi->common;
 
-        vp8_init_loop_filter(cm);
+        vp8_loop_filter_init(cm);
         cm->last_frame_type = KEY_FRAME;
         cm->last_filter_type = cm->filter_type;
         cm->last_sharpness_level = cm->sharpness_level;
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index a7af9ac..0c21689 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -274,9 +274,7 @@
                     int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
 
                     int filter_level;
-                    loop_filter_info *lfi = pc->lf_info;
-                    int alt_flt_enabled = xd->segmentation_enabled;
-                    int Segment;
+                    loop_filter_info_n *lfi_n = &pc->lf_info;
 
                     pbi->mb_row_di[ithread].mb_row = mb_row;
                     pbi->mb_row_di[ithread].mbd.current_bc =  &pbi->mbc[mb_row%num_part];
@@ -362,7 +360,16 @@
 
                         if (pbi->common.filter_level)
                         {
-                            int skip_lf;
+                            int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                                            xd->mode_info_context->mbmi.mode != SPLITMV &&
+                                            xd->mode_info_context->mbmi.mb_skip_coeff);
+
+                            const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+                            const int seg = xd->mode_info_context->mbmi.segment_id;
+                            const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+                            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
                             if( mb_row != pc->mb_rows-1 )
                             {
                                 /* Save decoded MB last row data for next-row decoding */
@@ -388,35 +395,57 @@
                                 }
                             }
 
-                            /* update loopfilter info */
-                            Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
-                            skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                                            xd->mode_info_context->mbmi.mode != SPLITMV &&
-                                            xd->mode_info_context->mbmi.mb_skip_coeff);
-
-                            filter_level = pbi->mt_baseline_filter_level[Segment];
-                            /* Distance of Mb to the various image edges.
-                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                             * Apply any context driven MB level adjustment
-                             */
-                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
-
                             /* loopfilter on this macroblock. */
                             if (filter_level)
                             {
-                                if (mb_col > 0)
-                                    pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                                if(pc->filter_type == NORMAL_LOOPFILTER)
+                                {
+                                    loop_filter_info lfi;
+                                    FRAME_TYPE frame_type = pc->frame_type;
+                                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                                    lfi.mblim = lfi_n->mblim[filter_level];
+                                    lfi.blim = lfi_n->blim[filter_level];
+                                    lfi.lim = lfi_n->lim[filter_level];
+                                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-                                if (!skip_lf)
-                                    pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                                    if (mb_col > 0)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v)
+                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
 
-                                /* don't apply across umv border */
-                                if (mb_row > 0)
-                                    pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                                    if (!skip_lf)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v)
+                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
 
-                                if (!skip_lf)
-                                    pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                                    /* don't apply across umv border */
+                                    if (mb_row > 0)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h)
+                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+                                    if (!skip_lf)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h)
+                                        (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer,  recon_y_stride, recon_uv_stride, &lfi);
+                                }
+                                else
+                                {
+                                    if (mb_col > 0)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v)
+                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+                                    if (!skip_lf)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v)
+                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+                                    /* don't apply across umv border */
+                                    if (mb_row > 0)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h)
+                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+                                    if (!skip_lf)
+                                        LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h)
+                                        (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+                                }
                             }
+
                         }
 
                         recon_yoffset += 16;
@@ -681,53 +710,6 @@
     }
 }
 
-
-static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
-{
-    VP8_COMMON *cm  = &pbi->common;
-    MACROBLOCKD *mbd = &pbi->mb;
-    /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/
-    loop_filter_info *lfi = cm->lf_info;
-    FRAME_TYPE frame_type = cm->frame_type;
-
-    /*int mb_row;
-    int mb_col;
-    int baseline_filter_level[MAX_MB_SEGMENTS];*/
-    int alt_flt_enabled = mbd->segmentation_enabled;
-
-    int i;
-    /*unsigned char *y_ptr, *u_ptr, *v_ptr;*/
-
-    /* Note the baseline filter values for each segment */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            /* Delta Value */
-            else
-            {
-                pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
-            }
-        }
-    }
-    else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            pbi->mt_baseline_filter_level[i] = default_filt_lvl;
-    }
-
-    /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);
-}
-
-
 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
     int mb_row;
@@ -738,12 +720,10 @@
     volatile int *last_row_current_mb_col = NULL;
     int nsync = pbi->sync_range;
 
-    int filter_level;
-    loop_filter_info *lfi = pc->lf_info;
-    int alt_flt_enabled = xd->segmentation_enabled;
-    int Segment;
+    int filter_level = pc->filter_level;
+    loop_filter_info_n *lfi_n = &pc->lf_info;
 
-    if(pbi->common.filter_level)
+    if (filter_level)
     {
         /* Set above_row buffer to 127 for decoding first MB row */
         vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, pc->yv12_fb[pc->lst_fb_idx].y_width + 5);
@@ -764,7 +744,9 @@
             vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
             vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
         }
-        lpf_init(pbi, pc->filter_level);
+
+        /* Initialize the loop filter for this frame. */
+        vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level, pc->sharpness_level);
     }
 
     setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
@@ -774,7 +756,6 @@
 
     for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
     {
-
         xd->current_bc = &pbi->mbc[mb_row%num_part];
 
         /* vp8_decode_mb_row(pbi, pc, mb_row, xd); */
@@ -875,7 +856,16 @@
 
                 if (pbi->common.filter_level)
                 {
-                    int skip_lf;
+                    int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                                    xd->mode_info_context->mbmi.mode != SPLITMV &&
+                                    xd->mode_info_context->mbmi.mb_skip_coeff);
+
+                    const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+                    const int seg = xd->mode_info_context->mbmi.segment_id;
+                    const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+                    filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
                     /* Save decoded MB last row data for next-row decoding */
                     if(mb_row != pc->mb_rows-1)
                     {
@@ -901,36 +891,58 @@
                         }
                     }
 
-                    /* update loopfilter info */
-                    Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
-                    skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                                    xd->mode_info_context->mbmi.mode != SPLITMV &&
-                                    xd->mode_info_context->mbmi.mb_skip_coeff);
-                    filter_level = pbi->mt_baseline_filter_level[Segment];
-                    /* Distance of Mb to the various image edges.
-                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                     * Apply any context driven MB level adjustment
-                     */
-                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
-
                     /* loopfilter on this macroblock. */
                     if (filter_level)
                     {
-                        if (mb_col > 0)
-                            pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                        if(pc->filter_type == NORMAL_LOOPFILTER)
+                        {
+                            loop_filter_info lfi;
+                            FRAME_TYPE frame_type = pc->frame_type;
+                            const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                            lfi.mblim = lfi_n->mblim[filter_level];
+                            lfi.blim = lfi_n->blim[filter_level];
+                            lfi.lim = lfi_n->lim[filter_level];
+                            lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-                        if (!skip_lf)
-                            pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                            if (mb_col > 0)
+                                LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v)
+                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
 
-                        /* don't apply across umv border */
-                        if (mb_row > 0)
-                            pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                            if (!skip_lf)
+                                LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v)
+                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
 
-                        if (!skip_lf)
-                            pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
+                            /* don't apply across umv border */
+                            if (mb_row > 0)
+                                LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h)
+                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+                            if (!skip_lf)
+                                LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h)
+                                (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer,  recon_y_stride, recon_uv_stride, &lfi);
+                        }
+                        else
+                        {
+                            if (mb_col > 0)
+                                LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v)
+                                (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+                            if (!skip_lf)
+                                LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v)
+                                (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+                            /* don't apply across umv border */
+                            if (mb_row > 0)
+                                LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h)
+                                (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+                            if (!skip_lf)
+                                LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h)
+                                (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+                        }
                     }
-                }
 
+                }
                 recon_yoffset += 16;
                 recon_uvoffset += 8;
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 73b4c7d..d719f36 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -561,10 +561,6 @@
     sf->recode_loop = 1;
     sf->quarter_pixel_search = 1;
     sf->half_pixel_search = 1;
-    sf->full_freq[0] = 7;
-    sf->full_freq[1] = 7;
-    sf->min_fs_radius = 8;
-    sf->max_fs_radius = 32;
     sf->iterative_sub_pixel = 1;
     sf->optimize_coefficients = 1;
     sf->use_fastquant_for_pick = 0;
@@ -607,8 +603,6 @@
         sf->thresh_mult[THR_SPLITG   ] = 5000;
         sf->thresh_mult[THR_SPLITA   ] = 5000;
 
-        sf->full_freq[0] = 7;
-        sf->full_freq[1] = 15;
 
         sf->first_step = 0;
         sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -666,8 +660,6 @@
         sf->thresh_mult[THR_SPLITG   ] = 10000;
         sf->thresh_mult[THR_SPLITA   ] = 10000;
 #endif
-        sf->full_freq[0] = 15;
-        sf->full_freq[1] = 31;
 
         if (Speed > 0)
         {
@@ -761,8 +753,6 @@
             // alt ref frames
             sf->recode_loop = 2;
 
-            sf->full_freq[0] = 31;
-            sf->full_freq[1] = 63;
         }
 
         if (Speed > 3)
@@ -783,15 +773,11 @@
             sf->recode_loop = 0; // recode loop off
             sf->RD = 0;         // Turn rd off
 
-            sf->full_freq[0] = 63;
-            sf->full_freq[1] = 127;
         }
 
         if (Speed > 4)
         {
             sf->auto_filter = 0;                     // Faster selection of loop filter
-            sf->full_freq[0] = INT_MAX;
-            sf->full_freq[1] = INT_MAX;
 
             cpi->mode_check_freq[THR_V_PRED] = 2;
             cpi->mode_check_freq[THR_H_PRED] = 2;
@@ -853,8 +839,6 @@
         sf->thresh_mult[THR_SPLITMV  ] = 5000;
         sf->thresh_mult[THR_SPLITG   ] = 10000;
         sf->thresh_mult[THR_SPLITA   ] = 10000;
-        sf->full_freq[0] = 15;
-        sf->full_freq[1] = 31;
         sf->search_method = NSTEP;
 
         if (Speed > 0)
@@ -935,8 +919,6 @@
                 sf->thresh_mult[THR_SPLITA   ] = 50000;
             }
 
-            sf->full_freq[0] = 31;
-            sf->full_freq[1] = 63;
         }
 
         if (Speed > 2)
@@ -963,15 +945,11 @@
             sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
             sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
 
-            sf->full_freq[0] = 63;
-            sf->full_freq[1] = 127;
         }
 
         if (Speed > 3)
         {
             sf->RD = 0;
-            sf->full_freq[0] = INT_MAX;
-            sf->full_freq[1] = INT_MAX;
 
             sf->auto_filter = 1;
         }
@@ -2105,7 +2083,7 @@
     //when needed. This will avoid unnecessary calls of vp8cx_init_quantizer() for every frame.
     vp8cx_init_quantizer(cpi);
     {
-        vp8_init_loop_filter(cm);
+        vp8_loop_filter_init(cm);
         cm->last_frame_type = KEY_FRAME;
         cm->last_filter_type = cm->filter_type;
         cm->last_sharpness_level = cm->sharpness_level;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 107a681..be79cb0 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -180,9 +180,6 @@
     int half_pixel_search;
     int quarter_pixel_search;
     int thresh_mult[MAX_MODES];
-    int full_freq[2];
-    int min_fs_radius;
-    int max_fs_radius;
     int max_step_search_steps;
     int first_step;
     int optimize_coefficients;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 8f87611..725e44e 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -42,9 +42,7 @@
 extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
-
 extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
-extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel);
 extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
 
 
@@ -575,18 +573,6 @@
                 continue;
         }
 
-        if(cpi->sf.improved_mv_pred && x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
-        {
-            if(!saddone)
-            {
-                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
-                saddone = 1;
-            }
-
-            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
-                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
-        }
-
         switch (this_mode)
         {
         case B_PRED:
@@ -666,6 +652,15 @@
 
             if(cpi->sf.improved_mv_pred)
             {
+                if(!saddone)
+                {
+                    vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                    saddone = 1;
+                }
+
+                vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                            x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
                 sr += speed_adjust;
                 //adjust search range according to sr from mv prediction
                 if(sr > step_param)
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 451a818..8b18541 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1864,18 +1864,6 @@
             lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame];
         }
 
-        if(x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
-        {
-            if(!saddone)
-            {
-                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
-                saddone = 1;
-            }
-
-            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
-                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
-        }
-
         // Check to see if the testing frequency for this mode is at its max
         // If so then prevent it from being tested and increase the threshold for its testing
         if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
@@ -2016,6 +2004,15 @@
             int tmp_row_min = x->mv_row_min;
             int tmp_row_max = x->mv_row_max;
 
+            if(!saddone)
+            {
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                saddone = 1;
+            }
+
+            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
             mvp_full.as_mv.col = mvp.as_mv.col>>3;
             mvp_full.as_mv.row = mvp.as_mv.row>>3;
 
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 1c2656e..19913a9 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -262,10 +262,19 @@
     for (mb_row = 0; mb_row < mb_rows; mb_row++)
     {
 #if ALT_REF_MC_ENABLED
-        // Reduced search extent by 3 for 6-tap filter & smaller UMV border
-        cpi->mb.mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 19));
+        // Source frames are extended to 16 pixels.  This is different than
+        //  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
+        // A 6 tap filter is used for motion search.  This requires 2 pixels
+        //  before and 3 pixels after.  So the largest Y mv on a border would
+        //  then be 16 - 3.  The UV blocks are half the size of the Y and
+        //  therefore only extended by 8.  The largest mv that a UV block
+        //  can support is 8 - 3.  A UV mv is half of a Y mv.
+        //  (16 - 3) >> 1 == 6 which is greater than 8 - 3.
+        // To keep the mv in play for both Y and UV planes the max that it
+        //  can be on a border is therefore 16 - 5.
+        cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5));
         cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
-                                + (VP8BORDERINPIXELS - 19);
+                                + (16 - 5);
 #endif
 
         for (mb_col = 0; mb_col < mb_cols; mb_col++)
@@ -277,10 +286,9 @@
             vpx_memset(count, 0, 384*sizeof(unsigned short));
 
 #if ALT_REF_MC_ENABLED
-            // Reduced search extent by 3 for 6-tap filter & smaller UMV border
-            cpi->mb.mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 19));
+            cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
             cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
-                                    + (VP8BORDERINPIXELS - 19);
+                                    + (16 - 5);
 #endif
 
             for (frame = 0; frame < frame_count; frame++)
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index e14e6fc..15e7336 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -95,101 +95,183 @@
 
 static void tokenize2nd_order_b
 (
-    const BLOCKD *const b,
+    MACROBLOCKD *x,
     TOKENEXTRA **tp,
-    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
-    ENTROPY_CONTEXT *a,
-    ENTROPY_CONTEXT *l,
     VP8_COMP *cpi
 )
 {
-    int pt; /* near block/prev token context index */
-    int c = 0;          /* start at DC */
-    const int eob = b->eob;     /* one beyond last nonzero coeff */
-    TOKENEXTRA *t = *tp;        /* store tokens starting here */
-    int x;
-    const short *qcoeff_ptr = b->qcoeff;
+    int pt;             /* near block/prev token context index */
+    int c;              /* start at DC */
+    TOKENEXTRA *t = *tp;/* store tokens starting here */
+    const BLOCKD *b;
+    const short *qcoeff_ptr;
+    ENTROPY_CONTEXT * a;
+    ENTROPY_CONTEXT * l;
+    int band, rc, v, token;
+
+    b = x->block + 24;
+    qcoeff_ptr = b->qcoeff;
+    a = (ENTROPY_CONTEXT *)x->above_context + 8;
+    l = (ENTROPY_CONTEXT *)x->left_context + 8;
+
     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
-    do
+    for (c = 0; c < b->eob; c++)
     {
-        const int band = vp8_coef_bands[c];
+        rc = vp8_default_zig_zag1d[c];
+        band = vp8_coef_bands[c];
+        v = qcoeff_ptr[rc];
 
-        if (c < eob)
-        {
-            int rc = vp8_default_zig_zag1d[c];
-            const int v = qcoeff_ptr[rc];
-#if CONFIG_DEBUG
-            assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
-#endif
-            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
-            x        = vp8_dct_value_tokens_ptr[v].Token;
-        }
-        else
-            x = DCT_EOB_TOKEN;
+        t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+        token    = vp8_dct_value_tokens_ptr[v].Token;
 
-        t->Token = x;
-        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+        t->Token = token;
+        t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
 
-        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+        t->skip_eob_node = ((pt == 0) && (band > 0));
 
-        ++cpi->coef_counts       [type] [band] [pt] [x];
+        ++cpi->coef_counts       [1] [band] [pt] [token];
+
+        pt = vp8_prev_token_class[token];
+        t++;
     }
-    while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < 16);
+    if (c < 16)
+    {
+        band = vp8_coef_bands[c];
+        t->Token = DCT_EOB_TOKEN;
+        t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+        t->skip_eob_node = ((pt == 0) && (band > 0));
+
+        ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
+
+        t++;
+    }
 
     *tp = t;
-    pt = (c != !type); /* 0 <-> all coeff data is zero */
+    pt = (c != 0); /* 0 <-> all coeff data is zero */
     *a = *l = pt;
 
 }
 
 static void tokenize1st_order_b
 (
-    const BLOCKD *const b,
+    MACROBLOCKD *x,
     TOKENEXTRA **tp,
-    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
-    ENTROPY_CONTEXT *a,
-    ENTROPY_CONTEXT *l,
+    int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
     VP8_COMP *cpi
 )
 {
-    int pt; /* near block/prev token context index */
-    int c = type ? 0 : 1;       /* start at DC unless type 0 */
-    const int eob = b->eob;     /* one beyond last nonzero coeff */
-    TOKENEXTRA *t = *tp;        /* store tokens starting here */
-    int x;
-    const short *qcoeff_ptr = b->qcoeff;
-    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    unsigned int block;
+    const BLOCKD *b;
+    int pt;             /* near block/prev token context index */
+    int c;
+    int token;
+    TOKENEXTRA *t = *tp;/* store tokens starting here */
+    const short *qcoeff_ptr;
+    ENTROPY_CONTEXT * a;
+    ENTROPY_CONTEXT * l;
+    int band, rc, v;
+    int tmp1, tmp2;
 
-    do
+    b = x->block;
+    /* Luma */
+    for (block = 0; block < 16; block++, b++)
     {
-        const int band = vp8_coef_bands[c];
+        tmp1 = vp8_block2above[block];
+        tmp2 = vp8_block2left[block];
+        qcoeff_ptr = b->qcoeff;
+        a = (ENTROPY_CONTEXT *)x->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)x->left_context + tmp2;
 
-        x = DCT_EOB_TOKEN;
+        VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
-        if (c < eob)
+        c = type ? 0 : 1;
+
+        for (; c < b->eob; c++)
         {
-            int rc = vp8_default_zig_zag1d[c];
-            const int v = qcoeff_ptr[rc];
-#if CONFIG_DEBUG
-            assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
-#endif
+            rc = vp8_default_zig_zag1d[c];
+            band = vp8_coef_bands[c];
+            v = qcoeff_ptr[rc];
+
             t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
-            x        = vp8_dct_value_tokens_ptr[v].Token;
+            token    = vp8_dct_value_tokens_ptr[v].Token;
+
+            t->Token = token;
+            t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+            t->skip_eob_node = pt == 0 &&
+                ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+            ++cpi->coef_counts       [type] [band] [pt] [token];
+
+            pt = vp8_prev_token_class[token];
+            t++;
         }
+        if (c < 16)
+        {
+            band = vp8_coef_bands[c];
+            t->Token = DCT_EOB_TOKEN;
+            t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
 
-        t->Token = x;
-        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+            t->skip_eob_node = pt == 0 &&
+                ((band > 0 && type > 0) || (band > 1 && type == 0));
 
-        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+            ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
 
-        ++cpi->coef_counts       [type] [band] [pt] [x];
+            t++;
+        }
+        *tp = t;
+        pt = (c != !type); /* 0 <-> all coeff data is zero */
+        *a = *l = pt;
+
     }
-    while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < 16);
+    /* Chroma */
+    for (block = 16; block < 24; block++, b++)
+    {
+        tmp1 = vp8_block2above[block];
+        tmp2 = vp8_block2left[block];
+        qcoeff_ptr = b->qcoeff;
+        a = (ENTROPY_CONTEXT *)x->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)x->left_context + tmp2;
 
-    *tp = t;
-    pt = (c != !type); /* 0 <-> all coeff data is zero */
-    *a = *l = pt;
+        VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+        for (c = 0; c < b->eob; c++)
+        {
+            rc = vp8_default_zig_zag1d[c];
+            band = vp8_coef_bands[c];
+            v = qcoeff_ptr[rc];
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            token    = vp8_dct_value_tokens_ptr[v].Token;
+
+            t->Token = token;
+            t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+            t->skip_eob_node = ((pt == 0) && (band > 0));
+
+            ++cpi->coef_counts       [2] [band] [pt] [token];
+
+            pt = vp8_prev_token_class[token];
+            t++;
+        }
+        if (c < 16)
+        {
+            band = vp8_coef_bands[c];
+            t->Token = DCT_EOB_TOKEN;
+            t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+            t->skip_eob_node = ((pt == 0) && (band > 0));
+
+            ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+
+            t++;
+        }
+        *tp = t;
+        pt = (c != 0); /* 0 <-> all coeff data is zero */
+        *a = *l = pt;
+    }
 
 }
 
@@ -214,10 +296,7 @@
 
 void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
 {
-    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
-    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
     int plane_type;
-    int b;
     int has_y2_block;
 
     has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
@@ -240,26 +319,15 @@
 
     cpi->skip_false_count++;
 
-
-
     plane_type = 3;
     if(has_y2_block)
     {
-        tokenize2nd_order_b(x->block + 24, t, 1,
-                   A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+        tokenize2nd_order_b(x, t, cpi);
         plane_type = 0;
 
     }
 
-    for (b = 0; b < 16; b++)
-        tokenize1st_order_b(x->block + b, t, plane_type,
-                            A + vp8_block2above[b],
-                            L + vp8_block2left[b], cpi);
-
-    for (b = 16; b < 24; b++)
-        tokenize1st_order_b(x->block + b, t, 2,
-                            A + vp8_block2above[b],
-                            L + vp8_block2left[b], cpi);
+    tokenize1st_order_b(x, t, plane_type, cpi);
 
 }
 
diff --git a/vpx_mem/include/nds/vpx_mem_nds.h b/vpx_mem/include/nds/vpx_mem_nds.h
deleted file mode 100644
index e54f54d..0000000
--- a/vpx_mem/include/nds/vpx_mem_nds.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __VPX_MEM_NDS_H__
-#define __VPX_MEM_NDS_H__
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-#include <nitro.h>
-#include <nitro/os.h>
-
-    void *vpx_mem_nds_alloc(osarena_id id, osheap_handle handle, size_t size, size_t align);
-    void vpx_mem_nds_free(osarena_id id, osheap_handle handle, void *mem);
-    int vpx_nds_alloc_heap(osarena_id id, u32 size);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /*__VPX_MEM_NDS_H__*/
diff --git a/vpx_mem/vpx_mem_tracker.c b/vpx_mem/vpx_mem_tracker.c
index 938ad07..9e8623a 100644
--- a/vpx_mem/vpx_mem_tracker.c
+++ b/vpx_mem/vpx_mem_tracker.c
@@ -36,9 +36,6 @@
 # include <winbase.h>
 #elif defined(VXWORKS)
 # include <sem_lib.h>
-#elif defined(NDS_NITRO)
-# include <nitro.h>
-# include <nitro/os.h>
 #endif
 
 #include <stdio.h>
@@ -112,8 +109,6 @@
     HANDLE mutex;
 #elif defined(VXWORKS)
     SEM_ID mutex;
-#elif defined(NDS_NITRO)
-    OSMutex mutex;
 #elif defined(NO_MUTEX)
 #else
 #error "No mutex type defined for this platform!"
@@ -193,9 +188,6 @@
             memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/
                                          SEM_FULL);  /*SEM_FULL initial state is unlocked*/
             ret = !memtrack.mutex;
-#elif defined(NDS_NITRO)
-            os_init_mutex(&memtrack.mutex);
-            ret = 0;
 #elif defined(NO_MUTEX)
             ret = 0;
 #endif
@@ -251,9 +243,7 @@
 
         if (!g_logging.type && g_logging.file && g_logging.file != stderr)
         {
-#if !defined(NDS_NITRO)
             fclose(g_logging.file);
-#endif
             g_logging.file = NULL;
         }
 
@@ -368,15 +358,12 @@
             g_logging.file = stderr;
             ret = 0;
         }
-
-#if !defined(NDS_NITRO)
         else
         {
             if ((g_logging.file = fopen((char *)option, "w")))
                 ret = 0;
         }
 
-#endif
         break;
 #if defined(WIN32) && !defined(_WIN32_WCE)
     case 1:
@@ -506,12 +493,6 @@
                          p->addr, i, p->size,
                          p->file, p->line);
 
-#ifdef NDS_NITRO
-
-        if (!(i % 20)) os_sleep(500);
-
-#endif
-
         p = p->next;
         ++i;
     }
@@ -719,9 +700,6 @@
         ret = WaitForSingleObject(memtrack.mutex, INFINITE);
 #elif defined(VXWORKS)
         ret = sem_take(memtrack.mutex, WAIT_FOREVER);
-#elif defined(NDS_NITRO)
-        os_lock_mutex(&memtrack.mutex);
-        ret = 0;
 #endif
 
         if (ret)
@@ -754,9 +732,6 @@
         ret = !ReleaseMutex(memtrack.mutex);
 #elif defined(VXWORKS)
         ret = sem_give(memtrack.mutex);
-#elif defined(NDS_NITRO)
-        os_unlock_mutex(&memtrack.mutex);
-        ret = 0;
 #endif
 
         if (ret)
diff --git a/vpx_scale/arm/nds/yv12extend.c b/vpx_scale/arm/nds/yv12extend.c
deleted file mode 100644
index 48c0dfb..0000000
--- a/vpx_scale/arm/nds/yv12extend.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     yv12extend.c
-*
-*   Description  :
-*
-***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-#include <nitro.h>
-#include <nitro/mi.h>
-#include <nitro/itcm_begin.h>
-
-//---- DMA Number
-#define DMA_NO  3
-
-/****************************************************************************
-*  Exports
-****************************************************************************/
-
-/****************************************************************************
-*
-****************************************************************************/
-void
-vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
-{
-    int i;
-    unsigned char *src_ptr1, *src_ptr2;
-    unsigned char *dest_ptr1, *dest_ptr2;
-
-    unsigned int Border;
-    int plane_stride;
-    int plane_height;
-    int plane_width;
-
-    /***********/
-    /* Y Plane */
-    /***********/
-    Border = ybf->border;
-    plane_stride = ybf->y_stride;
-    plane_height = ybf->y_height;
-    plane_width = ybf->y_width;
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->y_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
-        mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->y_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)Border; i++)
-    {
-        mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
-        mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    plane_stride /= 2;
-    plane_height /= 2;
-    plane_width /= 2;
-    Border /= 2;
-
-    /***********/
-    /* U Plane */
-    /***********/
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->u_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
-        mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->u_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
-        mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    /***********/
-    /* V Plane */
-    /***********/
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->v_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
-        mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->v_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
-        mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-}
-
-
-
-/****************************************************************************
-*
-*  ROUTINE       : vp8_yv12_copy_frame
-*
-*  INPUTS        :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : Copies the source image into the destination image and
-*                  updates the destination's UMV borders.
-*
-*  SPECIAL NOTES : The frames are assumed to be identical in size.
-*
-****************************************************************************/
-void
-vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-{
-    int yplane_size = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride);
-    int mem_size = (yplane_size * 3 / 2) + (src_ybc->y_stride * 2);
-
-    mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, mem_size);
-
-    /*  unsigned char *src_y, *dst_y;
-        unsigned char *src_u, *dst_u;
-        unsigned char *src_v, *dst_v;
-
-        int yheight, uv_height;
-        int ystride, uv_stride;
-        int border;
-        int yoffset, uvoffset;
-
-        border   = src_ybc->border;
-        yheight  = src_ybc->y_height;
-        uv_height = src_ybc->uv_height;
-
-        ystride  = src_ybc->y_stride;
-        uv_stride = src_ybc->uv_stride;
-
-        yoffset  = border * (ystride + 1);
-        uvoffset = border/2 * (uv_stride + 1);
-
-        src_y = src_ybc->y_buffer - yoffset;
-        dst_y = dst_ybc->y_buffer - yoffset;
-        src_u = src_ybc->u_buffer - uvoffset;
-        dst_u = dst_ybc->u_buffer - uvoffset;
-        src_v = src_ybc->v_buffer - uvoffset;
-        dst_v = dst_ybc->v_buffer - uvoffset;
-
-        mi_cpu_copy_fast (src_y, dst_y, ystride *  (yheight + 2 * border));
-        mi_cpu_copy_fast (src_u, dst_u, uv_stride * (uv_height + border));
-        mi_cpu_copy_fast (src_v, dst_v, uv_stride * (uv_height + border));
-    */
-}
-
-#include <nitro/itcm_end.h>
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index cb0ab94..d02cde2 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -24,9 +24,12 @@
 {
     if (ybf)
     {
-            duck_free(ybf->buffer_alloc);
+        vpx_free(ybf->buffer_alloc);
 
-        ybf->buffer_alloc = 0;
+        /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
+          u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
+          all of this so that a freed pointer isn't inadvertently used */
+        vpx_memset (ybf, 0, sizeof (YV12_BUFFER_CONFIG));
     }
     else
     {
@@ -44,38 +47,37 @@
 {
 /*NOTE:*/
 
-    int yplane_size = (height + 2 * border) * (width + 2 * border);
-    int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border);
-
     if (ybf)
     {
+        int uv_width = width >> 1;
+        int uv_height = height >> 1;
+        int yplane_size = (height + 2 * border) * (width + 2 * border);
+        int uvplane_size = (uv_height + border) * (uv_width + border);
+
         vp8_yv12_de_alloc_frame_buffer(ybf);
 
+        /* only support allocating buffers that have
+          a height and width that are multiples of 16 */
+        if ((width & 0xf) | (height & 0xf))
+            return -3;
+
         ybf->y_width  = width;
         ybf->y_height = height;
         ybf->y_stride = width + 2 * border;
 
-        ybf->uv_width = (1 + width) / 2;
-        ybf->uv_height = (1 + height) / 2;
-        ybf->uv_stride = ybf->uv_width + border;
+        ybf->uv_width = uv_width;
+        ybf->uv_height = uv_height;
+        ybf->uv_stride = uv_width + border;
 
         ybf->border = border;
         ybf->frame_size = yplane_size + 2 * uvplane_size;
 
-        /* Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
-         * when we have a large motion vector in V on the last v block.
-         * Note : We never use these pixels anyway so this doesn't hurt.
-         */
-        ybf->buffer_alloc = (unsigned char *) duck_memalign(32,  ybf->frame_size + (ybf->y_stride * 2) + 32, 0);
+        ybf->buffer_alloc = (unsigned char *) vpx_memalign(32, ybf->frame_size);
 
         if (ybf->buffer_alloc == NULL)
             return -1;
 
         ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border;
-
-        if (yplane_size & 0xf)
-            yplane_size += 16 - (yplane_size & 0xf);
-
         ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * ybf->uv_stride) + border / 2;
         ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * ybf->uv_stride) + border / 2;
 
diff --git a/vpxenc.c b/vpxenc.c
index 042f07b..d82a97f 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -501,15 +501,42 @@
     if(fwrite(buffer_in, 1, len, glob->stream));
 }
 
-
-void Ebml_Serialize(EbmlGlobal *glob, const void *buffer_in, unsigned long len)
-{
-    const unsigned char *q = (const unsigned char *)buffer_in + len - 1;
-
-    for(; len; len--)
-        Ebml_Write(glob, q--, 1);
+#define WRITE_BUFFER(s) \
+for(i = len-1; i>=0; i--)\
+{ \
+    x = *(const s *)buffer_in >> (i * CHAR_BIT); \
+    Ebml_Write(glob, &x, 1); \
 }
+void Ebml_Serialize(EbmlGlobal *glob, const void *buffer_in, int buffer_size, unsigned long len)
+{
+    char x;
+    int i;
 
+    /* buffer_size:
+     * 1 - int8_t;
+     * 2 - int16_t;
+     * 3 - int32_t;
+     * 4 - int64_t;
+     */
+    switch (buffer_size)
+    {
+        case 1:
+            WRITE_BUFFER(int8_t)
+            break;
+        case 2:
+            WRITE_BUFFER(int16_t)
+            break;
+        case 4:
+            WRITE_BUFFER(int32_t)
+            break;
+        case 8:
+            WRITE_BUFFER(int64_t)
+            break;
+        default:
+            break;
+    }
+}
+#undef WRITE_BUFFER
 
 /* Need a fixed size serializer for the track ID. libmkv provdes a 64 bit
  * one, but not a 32 bit one.
@@ -518,8 +545,8 @@
 {
     unsigned char sizeSerialized = 4 | 0x80;
     Ebml_WriteID(glob, class_id);
-    Ebml_Serialize(glob, &sizeSerialized, 1);
-    Ebml_Serialize(glob, &ui, 4);
+    Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+    Ebml_Serialize(glob, &ui, sizeof(ui), 4);
 }
 
 
@@ -533,7 +560,7 @@
 
     Ebml_WriteID(glob, class_id);
     *ebmlLoc = ftello(glob->stream);
-    Ebml_Serialize(glob, &unknownLen, 8);
+    Ebml_Serialize(glob, &unknownLen, sizeof(unknownLen), 8);
 }
 
 static void
@@ -551,7 +578,7 @@
 
     /* Seek back to the beginning of the element and write the new size */
     fseeko(glob->stream, *ebmlLoc, SEEK_SET);
-    Ebml_Serialize(glob, &size, 8);
+    Ebml_Serialize(glob, &size, sizeof(size), 8);
 
     /* Reset the stream pointer */
     fseeko(glob->stream, pos, SEEK_SET);
@@ -741,13 +768,13 @@
 
     block_length = pkt->data.frame.sz + 4;
     block_length |= 0x10000000;
-    Ebml_Serialize(glob, &block_length, 4);
+    Ebml_Serialize(glob, &block_length, sizeof(block_length), 4);
 
     track_number = 1;
     track_number |= 0x80;
     Ebml_Write(glob, &track_number, 1);
 
-    Ebml_Serialize(glob, &block_timecode, 2);
+    Ebml_Serialize(glob, &block_timecode, sizeof(block_timecode), 2);
 
     flags = 0;
     if(is_keyframe)
@@ -1312,6 +1339,11 @@
      * adjustment (5/4) to account for alt-refs
      */
     hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
+
+    // prevent division by zero
+    if (hist->samples == 0)
+      hist->samples=1;
+
     hist->pts = calloc(hist->samples, sizeof(*hist->pts));
     hist->sz = calloc(hist->samples, sizeof(*hist->sz));
     for(i=0; i<RATE_BINS; i++)