Initial WebM release
diff --git a/vp8/decoder/arm/armv5/dequantize_v5.asm b/vp8/decoder/arm/armv5/dequantize_v5.asm
new file mode 100644
index 0000000..eb3f030
--- /dev/null
+++ b/vp8/decoder/arm/armv5/dequantize_v5.asm
@@ -0,0 +1,51 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_armv5|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+q       RN  r0
+dqc     RN  r1
+cnt     RN  r2
+
+;void vp8_dequantize_b_armv5(short *Q, short *DQC)
+|vp8_dequantize_b_armv5| PROC
+    stmdb   sp!, {r4, lr}           ; r4 and lr double as scratch registers below
+    ldr     r3, [q]                 ; first pair of Q halfwords
+    ldr     r4, [dqc], #8           ; first pair of DQC halfwords; dqc += 8 bytes
+; 4 passes x 4 products = 16 halfwords, each product stored back over its Q input
+    mov     cnt, #4
+dequant_loop
+    smulbb  lr, r3, r4              ; bottom halfword of Q * bottom halfword of DQC
+    smultt  r12, r3, r4             ; top halfword of Q * top halfword of DQC
+
+    ldr     r3, [q, #4]             ; next Q pair, read before the stores advance q
+    ldr     r4, [dqc, #-4]          ; next DQC pair (dqc already points 8 bytes ahead)
+
+    strh    lr, [q], #2             ; low 16 bits of each product overwrite Q in place
+    strh    r12, [q], #2
+
+    smulbb  lr, r3, r4
+    smultt  r12, r3, r4
+
+    subs    cnt, cnt, #1            ; flags stay live until the bne: nothing below sets them
+    ldrne   r3, [q, #4]             ; preload for the next pass; skipped on the final pass
+    ldrne   r4, [dqc], #8
+
+    strh    lr, [q], #2
+    strh    r12, [q], #2
+
+    bne     dequant_loop
+
+    ldmia   sp!, {r4, pc}           ; restore r4 and return by loading the saved lr into pc
+    ENDP    ;|vp8_dequantize_b_armv5|
+
+    END
diff --git a/vp8/decoder/arm/armv6/dboolhuff_v6.asm b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
new file mode 100644
index 0000000..143e33e
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
@@ -0,0 +1,162 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_decode_value_v6|
+    EXPORT  |vp8dx_start_decode_v6|
+    EXPORT  |vp8dx_stop_decode_v6|
+    EXPORT  |vp8dx_decode_bool_v6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+br      RN  r0
+prob    RN  r1
+bits    RN  r1
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;   int z = 0;
+;   int bit;
+;   for ( bit=bits-1; bit>=0; bit-- )
+;   {
+;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
+;   }
+;   return z;
+
+;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits )
+|vp8_decode_value_v6| PROC
+    stmdb   sp!, {r4 - r6, lr}
+    mov     r4, br                  ; r4 = decoder pointer, copied back into r0 before every call
+    mov     r5, bits                ; r5 = bit index (bits shares r1 with prob, so copy it first)
+    mov     r6, #0                  ; r6 = z, the value being assembled
+; start at bit = bits-1; if that is already negative (bits <= 0) return z = 0
+    subs    r5, r5, #1
+    bmi     decode_value_exit
+
+decode_value_loop
+    mov     prob, #0x80             ; probability argument 0x80, as in the C reference above
+    mov     br, r4
+    bl      vp8dx_decode_bool_v6_internal     ; needed for conversion to s file
+    orr     r6, r6, r0, lsl r5      ; z |= decoded bit << bit index
+    subs    r5, r5, #1
+    bpl     decode_value_loop       ; continue while the bit index is >= 0
+
+decode_value_exit
+    mov     r0, r6                  ; return z
+    ldmia   sp!, {r4 - r6, pc}
+    ENDP    ; |vp8_decode_value_v6|
+
+
+;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source )
+|vp8dx_start_decode_v6| PROC
+    stmdb   sp!, {r4 - r5, lr}      ; r4/r5 hold source bytes below
+    mov     r2, #0
+    mov     r3, #255
+; initial decoder state: lowvalue = 0, range = 255, buffer = source
+    str     r2, [br, #bool_decoder_lowvalue]
+    str     r3, [br, #bool_decoder_range]
+    str     r1, [br, #bool_decoder_buffer]
+
+    mov     r3, #8
+    mov     r2, #4
+    str     r3, [br, #bool_decoder_count]   ; count = 8
+    str     r2, [br, #bool_decoder_pos]     ; pos = 4, matching the four bytes loaded into value below
+; assemble value from source[0..3] with source[0] in the most significant byte
+    ldrb    r2, [r1, #3]
+    ldrb    r3, [r1, #2]
+    ldrb    r4, [r1, #1]
+    ldrb    r5, [r1]
+
+    orr     r1, r2, r3, lsl #8
+    orr     r1, r1, r4, lsl #16
+    orr     r1, r1, r5, lsl #24
+
+    str     r1, [br, #bool_decoder_value]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ; |vp8dx_start_decode_v6|
+
+
+;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc );
+|vp8dx_stop_decode_v6| PROC
+    mov     pc, lr                  ; nothing to do: returns immediately without touching bc
+    ENDP    ; |vp8dx_stop_decode_v6|
+
+
+; bigsplit  RN  r1
+; buffer_v  RN  r1
+; count_v       RN  r4
+; range_v       RN  r2
+; value_v       RN  r3
+; pos_v     RN  r5
+; split     RN  r6
+; bit           RN  lr
+;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability )
+|vp8dx_decode_bool_v6| PROC
+vp8dx_decode_bool_v6_internal
+    stmdb   sp!, {r4 - r6, lr}
+
+    ldr     r2, [br, #bool_decoder_range]   ; r2 = range
+    ldr     r3, [br, #bool_decoder_value]   ; r3 = value
+
+    mov     r6, r2, lsl #8
+    sub     r6, r6, #256                ;   split = 1 +  (((range-1) * probability) >> 8)
+    mov     r12, #1
+    smlawb  r6, r6, prob, r12           ; r6 = ((r6 * low halfword of prob) >> 16) + 1 = split
+
+    mov     lr, #0                      ; lr = decoded bit, 0 unless value >= split << 24
+    subs    r5, r3, r6, lsl #24         ; r5 = value - (split << 24); carry set when no borrow
+
+    ;cmp        r3, r1
+    movhs   lr, #1                      ; value >= split << 24: bit = 1 ...
+    movhs   r3, r5                      ; ... value -= split << 24
+    subhs   r2, r2, r6                  ; ... range -= split
+    movlo   r2, r6                      ; otherwise bit stays 0 and range = split
+
+    cmp     r2, #0x80
+    blt     range_less_0x80             ; range below 0x80 takes the renormalizing path
+    ;strd   r2, r3, [br, #bool_decoder_range]
+    str     r2, [br, #bool_decoder_range]
+    str     r3, [br, #bool_decoder_value]
+    mov     r0, lr                      ; return the bit
+    ldmia   sp!, {r4 - r6, pc}
+
+range_less_0x80
+    ldr     r5, [br, #bool_decoder_pos]
+    ldr     r1, [br, #bool_decoder_buffer]
+    ldr     r4, [br, #bool_decoder_count]
+    add     r1, r1, r5                  ; r1 = buffer + pos
+
+    clz       r12, r2
+    sub       r12, r12, #24             ; r12 = shift that moves range's top set bit to bit 7
+    subs      r4, r4, r12               ; count -= shift; the "le" ops below run when count <= 0
+    ldrleb    r6, [r1], #1              ; fetch the byte at buffer + pos
+    mov       r2, r2, lsl r12           ; range <<= shift
+    mov       r3, r3, lsl r12           ; value <<= shift
+    addle     r4, r4, #8                ; count += 8
+    rsble     r12, r4, #8               ; r12 = 8 - count
+    addle     r5, r5, #1                ; pos++
+    orrle     r3, r3, r6, lsl r12       ; value |= byte << r12
+
+    ;strd       r2, r3, [br, #bool_decoder_range]
+    ;strd       r4, r5, [br, #bool_decoder_count]
+    str         r2, [br, #bool_decoder_range]
+    str         r3, [br, #bool_decoder_value]
+    str         r4, [br, #bool_decoder_count]
+    str         r5, [br, #bool_decoder_pos]
+
+    mov     r0, lr
+
+    ldmia   sp!, {r4 - r6, pc}
+    ENDP    ; |vp8dx_decode_bool_v6|
+
+    END
diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
new file mode 100644
index 0000000..3daa9b3
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm
@@ -0,0 +1,202 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_dc_idct_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
+|vp8_dequant_dc_idct_v6| PROC
+    stmdb   sp!, {r4-r11, lr}       ; 9 registers = 36 bytes pushed
+
+    ldr     r6, [sp, #36]           ;load Dc (first word above the 36 bytes just pushed)
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r0, [sp]                ; keep the input pointer for the clearing code at the end
+
+    smultt  r7, r4, r5              ; only the top halfwords are multiplied for the first pair
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2            ; first halfword of input is replaced by Dc, not by a product
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    mov     r12, #3                 ; 3 passes x 4 halfwords cover the remaining 12 of 16
+
+dequant_dc_idct_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]            ; preload for the next pass; skipped on the final pass
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     dequant_dc_idct_loop
+
+    sub     r0, r0, #32             ; rewind r0 to the start of the 32-byte block just written
+    mov     r1, r2                  ; r1 = output
+    mov     r2, r3                  ; r2 = pitch, used below as a byte offset between output rows
+
+; short_idct4x4llm_v6_dual: pass 1 reads the block at r0 and writes to output; pass 2 reworks output in place
+
+    mov r3, #0x00004E00 ;                   cos
+    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+    mov r4, #0x00008A00 ;                       sin
+    orr r4, r4, #0x0000008C ; sinpi8sqrt2
+    mov r5, #0x2    ; i=2                           i
+loop1_dual_11
+    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
+    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
+    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
+
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
+    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
+    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
+    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
+    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
+    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
+    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
+    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
+    subs    r5, r5, #0x1    ; i--                           --
+    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
+    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
+    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
+    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
+    usub16  r7, r8, r7  ; c                                 c
+    uadd16  r6, r6, r10 ; d                             d
+    uadd16  r10, r11, r14   ; a                                             a
+    usub16  r8, r11, r14    ; b                                     b
+    uadd16  r9, r10, r6 ; a+d                                           a+d
+    usub16  r10, r10, r6    ; a-d                                               a-d
+    uadd16  r6, r8, r7  ; b+c                               b+c
+    usub16  r7, r8, r7  ; b-c                                   b-c
+    str r6, [r1, r2]    ; o5 | o4
+    add r6, r2, r2  ; pitch * 2                             p2
+    str r7, [r1, r6]    ; o9 | o8
+    add r6,  r6, r2 ; pitch * 3                             p3
+    str r10, [r1, r6]   ; o13 | o12
+    str r9, [r1], #0x4  ; o1 | o0           ++
+    bne loop1_dual_11   ; flags come from the subs above; nothing in between sets them
+    mov r5, #0x2    ; i=2                           i
+    sub r0, r1, #8  ; reset input/output        i/o
+loop2_dual_22
+    ldr r6, [r0, r2]    ; i5 | i4                               5|4
+    ldr r1, [r0]    ; i1 | i0           1|0
+    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
+    add r14, r2, #0x4   ; pitch + 4 bytes (2 shorts)                                            p+2
+    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
+    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
+    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
+    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)                                   tc1
+    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
+    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
+    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
+    uadd16  r10, r11, r9    ; a                                             a
+    usub16  r9, r11, r9 ; b                                         b
+    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
+    subs    r5, r5, #0x1    ; i--                           --
+    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
+    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
+    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
+    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
+
+    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
+    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
+    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
+    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
+    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
+    uadd16  r7, r10, r6 ; a+d                                   a+d
+    mov r8, #0x4    ; set up 4's                                        4
+    orr r8, r8, #0x40000    ;                                       4|4
+    usub16  r6, r10, r6 ; a-d                               a-d
+    uadd16  r6, r6, r8  ; a-d+4                             3|7
+    uadd16  r7, r7, r8  ; a+d+4                                 0|4
+    uadd16  r10, r9, r12    ; b+c                                               b+c
+    usub16  r1, r9, r12 ; b-c           b-c
+    uadd16  r10, r10, r8    ; b+c+4                                             1|5
+    uadd16  r1, r1, r8  ; b-c+4         2|6
+    mov r8, r10, asr #19    ; o1 >> 3 (asr #19 = top halfword, then >> 3)
+    strh    r8, [r0, #2]    ; o1
+    mov r8, r1, asr #19 ; o2 >> 3
+    strh    r8, [r0, #4]    ; o2
+    mov r8, r6, asr #19 ; o3 >> 3
+    strh    r8, [r0, #6]    ; o3
+    mov r8, r7, asr #19 ; o0 >> 3
+    strh    r8, [r0], r2    ; o0        +p
+    sxth    r10, r10    ; sign-extend the bottom halfword before shifting
+    mov r8, r10, asr #3 ; o5 >> 3
+    strh    r8, [r0, #2]    ; o5
+    sxth    r1, r1  ;
+    mov r8, r1, asr #3  ; o6 >> 3
+    strh    r8, [r0, #4]    ; o6
+    sxth    r6, r6  ;
+    mov r8, r6, asr #3  ; o7 >> 3
+    strh    r8, [r0, #6]    ; o7
+    sxth    r7, r7  ;
+    mov r8, r7, asr #3  ; o4 >> 3
+    strh    r8, [r0], r2    ; o4        +p
+;;;;;   subs    r5, r5, #0x1    ; i--                           --
+    bne loop2_dual_22   ; flags come from the subs earlier in this loop body
+
+
+;vpx_memset: zero the 32-byte input block with eight word stores
+    ldr     r0, [sp]                ; input pointer saved at function entry
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
+
+    ENDP    ;|vp8_dequant_dc_idct_v6|
+
+    END
diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm
new file mode 100644
index 0000000..61bb48d
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequantidct_v6.asm
@@ -0,0 +1,183 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_idct_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
+|vp8_dequant_idct_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r4, [r0]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r0, [sp]                ; keep the input pointer for the clearing code at the end
+
+    mov     r12, #4                 ; 4 passes x 4 halfwords = 16 products stored back into input
+
+dequant_idct_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4        ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]            ; preload for the next pass; skipped on the final pass
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     dequant_idct_loop
+
+    sub     r0, r0, #32             ; rewind r0 to the start of the 32-byte block just written
+    mov     r1, r2                  ; r1 = output
+    mov     r2, r3                  ; r2 = pitch, used below as a byte offset between output rows
+
+; short_idct4x4llm_v6_dual: pass 1 reads the block at r0 and writes to output; pass 2 reworks output in place
+
+    mov r3, #0x00004E00 ;                   cos
+    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+    mov r4, #0x00008A00 ;                       sin
+    orr r4, r4, #0x0000008C ; sinpi8sqrt2
+    mov r5, #0x2    ; i=2                           i
+loop1_dual_1
+    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
+    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
+    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
+
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
+    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
+    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
+    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
+    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
+    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
+    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
+    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
+    subs    r5, r5, #0x1    ; i--                           --
+    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
+    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
+    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
+    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
+    usub16  r7, r8, r7  ; c                                 c
+    uadd16  r6, r6, r10 ; d                             d
+    uadd16  r10, r11, r14   ; a                                             a
+    usub16  r8, r11, r14    ; b                                     b
+    uadd16  r9, r10, r6 ; a+d                                           a+d
+    usub16  r10, r10, r6    ; a-d                                               a-d
+    uadd16  r6, r8, r7  ; b+c                               b+c
+    usub16  r7, r8, r7  ; b-c                                   b-c
+    str r6, [r1, r2]    ; o5 | o4
+    add r6, r2, r2  ; pitch * 2                             p2
+    str r7, [r1, r6]    ; o9 | o8
+    add r6,  r6, r2 ; pitch * 3                             p3
+    str r10, [r1, r6]   ; o13 | o12
+    str r9, [r1], #0x4  ; o1 | o0           ++
+    bne loop1_dual_1    ; flags come from the subs above; nothing in between sets them
+    mov r5, #0x2    ; i=2                           i
+    sub r0, r1, #8  ; reset input/output        i/o
+loop2_dual_2
+    ldr r6, [r0, r2]    ; i5 | i4                               5|4
+    ldr r1, [r0]    ; i1 | i0           1|0
+    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
+    add r14, r2, #0x4   ; pitch + 4 bytes (2 shorts)                                            p+2
+    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
+    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
+    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
+    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)                                   tc1
+    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
+    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
+    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
+    uadd16  r10, r11, r9    ; a                                             a
+    usub16  r9, r11, r9 ; b                                         b
+    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
+    subs    r5, r5, #0x1    ; i--                           --
+    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
+    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
+    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
+    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
+
+    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
+    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
+    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
+    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
+    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
+    uadd16  r7, r10, r6 ; a+d                                   a+d
+    mov r8, #0x4    ; set up 4's                                        4
+    orr r8, r8, #0x40000    ;                                       4|4
+    usub16  r6, r10, r6 ; a-d                               a-d
+    uadd16  r6, r6, r8  ; a-d+4                             3|7
+    uadd16  r7, r7, r8  ; a+d+4                                 0|4
+    uadd16  r10, r9, r12    ; b+c                                               b+c
+    usub16  r1, r9, r12 ; b-c           b-c
+    uadd16  r10, r10, r8    ; b+c+4                                             1|5
+    uadd16  r1, r1, r8  ; b-c+4         2|6
+    mov r8, r10, asr #19    ; o1 >> 3 (asr #19 = top halfword, then >> 3)
+    strh    r8, [r0, #2]    ; o1
+    mov r8, r1, asr #19 ; o2 >> 3
+    strh    r8, [r0, #4]    ; o2
+    mov r8, r6, asr #19 ; o3 >> 3
+    strh    r8, [r0, #6]    ; o3
+    mov r8, r7, asr #19 ; o0 >> 3
+    strh    r8, [r0], r2    ; o0        +p
+    sxth    r10, r10    ; sign-extend the bottom halfword before shifting
+    mov r8, r10, asr #3 ; o5 >> 3
+    strh    r8, [r0, #2]    ; o5
+    sxth    r1, r1  ;
+    mov r8, r1, asr #3  ; o6 >> 3
+    strh    r8, [r0, #4]    ; o6
+    sxth    r6, r6  ;
+    mov r8, r6, asr #3  ; o7 >> 3
+    strh    r8, [r0, #6]    ; o7
+    sxth    r7, r7  ;
+    mov r8, r7, asr #3  ; o4 >> 3
+    strh    r8, [r0], r2    ; o4        +p
+;;;;;   subs    r5, r5, #0x1    ; i--                           --
+    bne loop2_dual_2    ; flags come from the subs earlier in this loop body
+            ;
+
+;vpx_memset: zero the 32-byte input block with eight word stores
+    ldr     r0, [sp]                ; input pointer saved at function entry
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
+
+    ENDP    ;|vp8_dequant_idct_v6|
+
+    END
diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/decoder/arm/armv6/dequantize_v6.asm
new file mode 100644
index 0000000..95e3859
--- /dev/null
+++ b/vp8/decoder/arm/armv6/dequantize_v6.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_v6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------
+;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_v6| PROC
+    stmdb   sp!, {r4-r9, lr}        ; r4-r9 and lr are used as working registers below
+
+    ldr     r3, [r0]                ;load Q
+    ldr     r4, [r1]                ;load DQC
+    ldr     r5, [r0, #4]
+    ldr     r6, [r1, #4]
+
+    mov     r12, #2                 ;loop counter: 2 passes x 8 products = 16 halfwords written to DQ
+
+dequant_loop
+    smulbb  r7, r3, r4              ;multiply
+    smultt  r8, r3, r4
+    smulbb  r9, r5, r6
+    smultt  lr, r5, r6
+
+    ldr     r3, [r0, #8]            ; second group of four Q / DQC halfwords for this pass
+    ldr     r4, [r1, #8]
+    ldr     r5, [r0, #12]
+    ldr     r6, [r1, #12]
+
+    strh    r7, [r2], #2            ;store result
+    smulbb  r7, r3, r4              ;multiply
+    strh    r8, [r2], #2
+    smultt  r8, r3, r4
+    strh    r9, [r2], #2
+    smulbb  r9, r5, r6
+    strh    lr, [r2], #2
+    smultt  lr, r5, r6
+
+    subs    r12, r12, #1            ; flags stay live until the bne: add/ldrne/strh do not set them
+
+    add     r0, r0, #16             ; advance Q and DQC by 8 halfwords; both are only read
+    add     r1, r1, #16
+
+    ldrne       r3, [r0]            ; preload for the next pass; skipped on the final pass
+    strh    r7, [r2], #2            ;store result
+    ldrne       r4, [r1]
+    strh    r8, [r2], #2
+    ldrne       r5, [r0, #4]
+    strh    r9, [r2], #2
+    ldrne       r6, [r1, #4]
+    strh    lr, [r2], #2
+
+    bne     dequant_loop
+
+    ldmia   sp!, {r4-r9, pc}
+    ENDP    ;|vp8_dequantize_b_loop_v6|
+
+    END
diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h
new file mode 100644
index 0000000..495004f
--- /dev/null
+++ b/vp8/decoder/arm/dboolhuff_arm.h
@@ -0,0 +1,49 @@
+#ifndef DBOOLHUFF_ARM_H
+#define DBOOLHUFF_ARM_H
+
+/* JLK
+ * There are currently no ARM-optimized versions of
+ * these functions. As they are implemented, they
+ * can be uncommented below and added to
+ * arm/dsystemdependent.c.
+ *
+ * The existing asm code is likely so different as
+ * to be useless. However, it has been left (for now)
+ * for reference.
+ */
+/*
+#if HAVE_ARMV6
+#undef vp8_dbool_start
+#define vp8_dbool_start vp8dx_start_decode_v6
+
+#undef vp8_dbool_stop
+#define vp8_dbool_stop vp8dx_stop_decode_v6
+
+#undef vp8_dbool_fill
+#define vp8_dbool_fill vp8_bool_decoder_fill_v6
+
+#undef vp8_dbool_debool
+#define vp8_dbool_debool vp8_decode_bool_v6
+
+#undef vp8_dbool_devalue
+#define vp8_dbool_devalue vp8_decode_value_v6
+#endif // HAVE_ARMV6
+
+#if HAVE_ARMV7
+#undef vp8_dbool_start
+#define vp8_dbool_start vp8dx_start_decode_neon
+
+#undef vp8_dbool_stop
+#define vp8_dbool_stop vp8dx_stop_decode_neon
+
+#undef vp8_dbool_fill
+#define vp8_dbool_fill vp8_bool_decoder_fill_neon
+
+#undef vp8_dbool_debool
+#define vp8_dbool_debool vp8_decode_bool_neon
+
+#undef vp8_dbool_devalue
+#define vp8_dbool_devalue vp8_decode_value_neon
+#endif // HAVE_ARMV7
+*/
+#endif // DBOOLHUFF_ARM_H
diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c
new file mode 100644
index 0000000..54006a9
--- /dev/null
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "predictdc.h"
+#include "idct.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_ARMV7
+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif /* HAVE_ARMV7 */
+/* External dequantize loops; the wrappers below pass them Q, DQC and DQ pointers taken from a BLOCKD. */
+#if HAVE_ARMV6
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+
+/* Dequantize block d: pass its qcoeff, dequant table and dqcoeff buffers to the NEON loop. */
+void vp8_dequantize_b_neon(BLOCKD *d)
+{
+    short *DQ  = d->dqcoeff;            /* destination */
+    short *Q   = d->qcoeff;             /* quantized coefficients */
+    short *DQC = &d->dequant[0][0];     /* first element of the dequant table */
+
+    vp8_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_ARMV6
+/* Dequantize block d: pass its qcoeff, dequant table and dqcoeff buffers to the ARMv6 loop. */
+void vp8_dequantize_b_v6(BLOCKD *d)
+{
+    short *DQ  = d->dqcoeff;            /* destination */
+    short *Q   = d->qcoeff;             /* quantized coefficients */
+    short *DQC = &d->dequant[0][0];     /* first element of the dequant table */
+
+    vp8_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
new file mode 100644
index 0000000..c8a61a4
--- /dev/null
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_ARM_H
+#define DEQUANTIZE_ARM_H
+/* Re-points the vp8_dequant_* macros at the ARM routines selected by HAVE_ARMV6 / HAVE_ARMV7. */
+#if HAVE_ARMV6
+extern prototype_dequant_block(vp8_dequantize_b_v6);
+extern prototype_dequant_idct(vp8_dequant_idct_v6);
+extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6);
+
+#undef  vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_v6
+
+#undef  vp8_dequant_idct
+#define vp8_dequant_idct vp8_dequant_idct_v6
+
+#undef  vp8_dequant_idct_dc
+#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6
+#endif /* HAVE_ARMV6 */
+/* This block comes second, so when both HAVE_ARMV6 and HAVE_ARMV7 are set the NEON names below win. */
+#if HAVE_ARMV7
+extern prototype_dequant_block(vp8_dequantize_b_neon);
+extern prototype_dequant_idct(vp8_dequant_idct_neon);
+extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon);
+
+#undef  vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_neon
+
+#undef  vp8_dequant_idct
+#define vp8_dequant_idct vp8_dequant_idct_neon
+
+#undef  vp8_dequant_idct_dc
+#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon
+#endif /* HAVE_ARMV7 */
+
+#endif /* DEQUANTIZE_ARM_H */
diff --git a/vp8/decoder/arm/detokenizearm_sjl.c b/vp8/decoder/arm/detokenizearm_sjl.c
new file mode 100644
index 0000000..c714452
--- /dev/null
+++ b/vp8/decoder/arm/detokenizearm_sjl.c
@@ -0,0 +1,730 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "type_aliases.h"
+#include "blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define BR_COUNT 8          /* amount added to count on each byte refill in NORMALIZE */
+#define BOOL_DATA UINT8     /* element type behind the bitstream read pointer */
+/* OCB_X is parenthesized so it expands safely next to any operator; the table entries must fit UINT8. */
+#define OCB_X (PREV_COEF_CONTEXTS * ENTROPY_NODES)
+//ALIGN16 UINT16 onyx_coef_bands_x[16] = { 0, 1*OCB_X, 2*OCB_X, 3*OCB_X, 6*OCB_X, 4*OCB_X, 5*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 7*OCB_X};
+DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
+
+#define EOB_CONTEXT_NODE            0   /* these eleven values are used as Prob[...] subscripts in this file */
+#define ZERO_CONTEXT_NODE           1
+#define ONE_CONTEXT_NODE            2
+#define LOW_VAL_CONTEXT_NODE        3
+#define TWO_CONTEXT_NODE            4
+#define THREE_CONTEXT_NODE          5
+#define HIGH_LOW_CONTEXT_NODE       6
+#define CAT_ONE_CONTEXT_NODE        7
+#define CAT_THREEFOUR_CONTEXT_NODE  8
+#define CAT_THREE_CONTEXT_NODE      9
+#define CAT_FIVE_CONTEXT_NODE       10  /* last one; eleven nodes in total */
+
+
+
+
+DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) =
+{   // one row per token; the decoders read .min_val, .Length and .Probs[Length..0] (initializer order presumably matches TOKENEXTRABITS -- confirm)
+    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //ZERO_TOKEN
+    {  1, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //ONE_TOKEN
+    {  2, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //TWO_TOKEN
+    {  3, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //THREE_TOKEN
+    {  4, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //FOUR_TOKEN
+    {  5, 0, { 159, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //DCT_VAL_CATEGORY1
+    {  7, 1, { 145, 165, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY2
+    { 11, 2, { 140, 148, 173, 0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY3
+    { 19, 3, { 135, 140, 155, 176, 0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY4
+    { 35, 4, { 130, 134, 141, 157, 180, 0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY5
+    { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0   } }, //DCT_VAL_CATEGORY6
+    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  // EOB TOKEN
+};
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) =
+{   // three 25-entry rows, one entry per block index 0..24
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context (entries 0..24)
+    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left (entries 25..49)
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above (entries 50..74)
+};
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+/*
+ * Zero the above/left entropy-context entries that this macroblock's blocks
+ * map to. Block 24 is skipped when the mode is B_PRED or SPLITMV.
+ */
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT **const above = x->above_context;
+    ENTROPY_CONTEXT(* const left)[4] = x->left_context;
+    int blk = 0;
+
+    /* Blocks 0..23 are always cleared. */
+    while (blk < 24)
+    {
+        ENTROPY_CONTEXT *const left_ctx  = left[ vp8_block2context[blk] ] + vp8_block2left[blk];
+        ENTROPY_CONTEXT *const above_ctx = above[ vp8_block2context[blk] ] + vp8_block2above[blk];
+
+        *left_ctx = 0;
+        *above_ctx = 0;
+        ++blk;
+    }
+
+    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+        return;
+
+    left[Y2CONTEXT][ vp8_block2left[24] ] = 0;
+    above[Y2CONTEXT][ vp8_block2above[24] ] = 0;
+}
+
+#define ONYXBLOCK2CONTEXT_OFFSET    0   /* first entry of the context row in vp8_block2context_leftabove */
+#define ONYXBLOCK2LEFT_OFFSET       25  /* first entry of the left row */
+#define ONYXBLOCK2ABOVE_OFFSET 50       /* first entry of the above row */
+
+DECLARE_ALIGNED(16, static const unsigned char, norm[256]) =  /* shift that brings a range value back to >= 0x80 */
+{   /* 256 entries: NORMALIZE indexes this with range unguarded, so values 128..255 need a (zero) entry too */
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+/* Fill dx->detoken with the table pointers, the qcoeff base and the per-type probability bases. */
+void init_detokenizer(VP8D_COMP *dx)
+{
+    const VP8_COMMON *const cm = &dx->common;
+    int block_type;
+
+    /* Lookup tables that the token decoder reaches through dx->detoken. */
+    dx->detoken.scan = (int *)vp8_default_zig_zag1d;
+    dx->detoken.ptr_onyxblock2context_leftabove = (UINT8 *)vp8_block2context_leftabove;
+    dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x;
+    dx->detoken.vp8_coef_tree_ptr = (vp8_tree_index *)vp8_coef_tree;
+    dx->detoken.teb_base_ptr = (TOKENEXTRABITS *)vp8d_token_extra_bits2;
+    dx->detoken.norm_ptr = (unsigned char *)norm;
+
+    /* Start of this decoder's coefficient buffer. */
+    dx->detoken.qcoeff_start_ptr = &dx->mb.qcoeff[0];
+
+    /* First probability of each of the four block types. */
+    for (block_type = 0; block_type < 4; block_type++)
+        dx->detoken.coef_probs[block_type] = (unsigned char *)(cm->fc.coef_probs[block_type][0][0]);
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+
+// Earlier forms of the NORMALIZE table lookup, kept for reference:  shift = norm[range];
+//                                                                   shift = norm_ptr[range];
+
+#define NORMALIZE \
+    /* shift range/value left by norm_ptr[range]; pull in one byte when count drops to <= 0. NOTE(review): the guard if(range < 0x80) is disabled, so norm_ptr must cover every value range can take */ \
+    { \
+        shift = detoken->norm_ptr[range]; \
+        range <<= shift; \
+        value <<= shift; \
+        count -= shift; \
+        if(count <= 0) \
+        { \
+            count += BR_COUNT ; \
+            value |= (*bufptr) << (BR_COUNT-count); \
+            bufptr++; \
+        } \
+    }
+#if 1
+#define DECODE_AND_APPLYSIGN(value_to_sign) /* read one bit at a half-range split; v = +(value_to_sign) or -(value_to_sign) */ \
+    split = (range + 1) >> 1; \
+    if ( (value >> 24) < split ) \
+    { \
+        range = split; \
+        v = (value_to_sign); \
+    } \
+    else \
+    { \
+        range = range-split; \
+        value = value-(split<<24); \
+        v = -(value_to_sign); \
+    } \
+    range +=range;                   \
+    value +=value;                   \
+    if (!--count) \
+    { \
+        count = BR_COUNT; \
+        value |= *bufptr; \
+        bufptr++; \
+    }
+/* Read one bit with the given probability; on a zero bit normalize and jump to branch, else fall through. */
+#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
+    { \
+        split = 1 +  ((( (probability)*(range-1) ) )>> 8); \
+        if ( (value >> 24) < split ) \
+        { \
+            range = split; \
+            NORMALIZE \
+            goto branch; \
+        } \
+        value -= (split<<24); \
+        range = range - split; \
+        NORMALIZE \
+    }
+/* Like DECODE_AND_BRANCH_IF_ZERO, but a zero bit also advances c and reloads Prob for the new position. */
+#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
+    { \
+        split = 1 + ((( (probability)*(range-1) ) ) >> 8); \
+        if ( (value >> 24) < split ) \
+        { \
+            range = split; \
+            NORMALIZE \
+            Prob = coef_probs; \
+            ++c; \
+            Prob += vp8_coef_bands_x[c]; \
+            goto branch; \
+        } \
+        value -= (split<<24); \
+        range = range - split; \
+        NORMALIZE \
+    }
+/* Decode the sign of val, store the coefficient at scan[c], then continue at DO_WHILE or finish the block. */
+#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
+    DECODE_AND_APPLYSIGN(val) \
+    Prob = coef_probs + (ENTROPY_NODES*2); \
+    if(c < 15){\
+        qcoeff_ptr [ scan[c] ] = (INT16) v; \
+        ++c; \
+        goto DO_WHILE; }\
+    qcoeff_ptr [ scan[15] ] = (INT16) v; \
+    goto BLOCK_FINISHED;
+/* Read extra bit number bits_count of token t and, when it is set, add 1<<bits_count to the
+ * variable named val in the expanding scope (val is not a macro parameter). */
+#define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\
+    split = 1 +  (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \
+    if(value >= (split<<24))\
+    {\
+        range = range-split;\
+        value = value-(split<<24);\
+        val += ((UINT16)1<<(bits_count));\
+    }\
+    else\
+    {\
+        range = split;\
+    }\
+    NORMALIZE
+#endif
+
+#if 0
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT **const A = x->above_context;
+    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    const VP8_COMMON *const oc = & dx->common;
+
+    BOOL_DECODER *bc = x->current_bc;
+
+    ENTROPY_CONTEXT *a;
+    ENTROPY_CONTEXT *l;
+    int i;
+
+    int eobtotal = 0;
+
+    register int count;
+
+    BOOL_DATA *bufptr;
+    register unsigned int range;
+    register unsigned int value;
+    const int *scan;
+    register unsigned int shift;
+    UINT32 split;
+    INT16 *qcoeff_ptr;
+
+    UINT8 *coef_probs;
+    int type;
+    int stop;
+    INT16 val, bits_count;
+    INT16 c;
+    INT16 t;
+    INT16 v;
+    vp8_prob *Prob;
+
+    //int *scan;
+    type = 3;
+    i = 0;
+    stop = 16;
+
+    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    {
+        i = 24;
+        stop = 24;
+        type = 1;
+        qcoeff_ptr = &x->qcoeff[24*16];
+        scan = vp8_default_zig_zag1d;
+        eobtotal -= 16;
+    }
+    else
+    {
+        scan = vp8_default_zig_zag1d;
+        qcoeff_ptr = &x->qcoeff[0];
+    }
+
+    count   = bc->count;
+    range   = bc->range;
+    value   = bc->value;
+    bufptr  = &bc->buffer[bc->pos];
+
+
+    coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+
+BLOCK_LOOP:
+    a = A[ vp8_block2context[i] ] + vp8_block2above[i];
+    l = L[ vp8_block2context[i] ] + vp8_block2left[i];
+    c = (INT16)(!type);
+
+    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+    Prob = coef_probs;
+    Prob += t * ENTROPY_NODES;
+
+DO_WHILE:
+    Prob += vp8_coef_bands_x[c];
+    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
+
+CHECK_0_:
+    DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_);
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val;
+    bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length;
+
+    do
+    {
+        DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
+        bits_count -- ;
+    }
+    while (bits_count >= 0);
+
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_FIVE_CONTEXT_NODE_0_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_THREEFOUR_CONTEXT_NODE_0_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_);
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_THREE_CONTEXT_NODE_0_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+HIGH_LOW_CONTEXT_NODE_0_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_);
+
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_ONE_CONTEXT_NODE_0_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+LOW_VAL_CONTEXT_NODE_0_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
+
+THREE_CONTEXT_NODE_0_:
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
+
+TWO_CONTEXT_NODE_0_:
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
+
+ONE_CONTEXT_NODE_0_:
+    DECODE_AND_APPLYSIGN(1);
+    Prob = coef_probs + ENTROPY_NODES;
+
+    if (c < 15)
+    {
+        qcoeff_ptr [ scan[c] ] = (INT16) v;
+        ++c;
+        goto DO_WHILE;
+    }
+
+    qcoeff_ptr [ scan[15] ] = (INT16) v;
+BLOCK_FINISHED:
+    t = ((x->Block[i].eob = c) != !type);   // any nonzero data?
+    eobtotal += x->Block[i].eob;
+    *a = *l = t;
+    qcoeff_ptr += 16;
+
+    i++;
+
+    if (i < stop)
+        goto BLOCK_LOOP;
+
+    if (i == 25)
+    {
+        scan = vp8_default_zig_zag1d;//x->scan_order1d;
+        type = 0;
+        i = 0;
+        stop = 16;
+        coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+        qcoeff_ptr = &x->qcoeff[0];
+        goto BLOCK_LOOP;
+    }
+
+    if (i == 16)
+    {
+        type = 2;
+        coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+        stop = 24;
+        goto BLOCK_LOOP;
+    }
+
+    bc->count = count;
+    bc->value = value;
+    bc->range = range;
+    bc->pos  = bufptr - bc->buffer;
+    return eobtotal;
+
+}
+//#endif
+#else
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+#if 0
+//uses relative offsets
+
+const vp8_tree_index vp8_coef_tree_x[ 22] =   /* corresponding _CONTEXT_NODEs */
+{
+    -DCT_EOB_TOKEN, 1,                             /* 0 = EOB */
+    -ZERO_TOKEN, 1,                               /* 1 = ZERO */
+    -ONE_TOKEN, 1,                               /* 2 = ONE */
+    2, 5,                                       /* 3 = LOW_VAL */
+    -TWO_TOKEN, 1,                         /* 4 = TWO */
+    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
+    2, 3,                                  /* 6 = HIGH_LOW */
+    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
+    2, 3,                                 /* 8 = CAT_THREEFOUR */
+    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
+    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
+};
+#endif
+
+#define _SCALEDOWN 8 //16 //8 -- right shift used for the extra-bit split in vp8_decode_mb_tokens_v5_c
+/* Exported by detokenizearm_v6.asm; vp8_decode_mb_tokens_v5_c below appears to be the C version of the same routine. */
+int vp8_decode_mb_tokens_v5(DETOK *detoken, int type);
+
+int vp8_decode_mb_tokens_v5_c(DETOK *detoken, int type)
+{
+    BOOL_DECODER *bc = detoken->current_bc;
+    /* Decodes one macroblock's tokens; results go to the qcoeff buffer, detoken->eob[] and the a/l contexts. Always returns 0. */
+    ENTROPY_CONTEXT *a;
+    ENTROPY_CONTEXT *l;
+    int i;
+
+    register int count;
+
+    BOOL_DATA *bufptr;
+    register unsigned int range;
+    register unsigned int value;
+    register unsigned int shift;
+    UINT32 split;
+    INT16 *qcoeff_ptr;
+
+    UINT8 *coef_probs;
+//  int type;
+    int stop;
+    INT16 c;
+    INT16 t;
+    INT16 v;
+    vp8_prob *Prob;
+    /* Block order: with type 1, block 24 first, then 0..15 as type 0 and
+     * 16..23 as type 2; otherwise 0..15 as the given type, then 16..23 as
+     * type 2 (see the i == 25 and i == 16 tests near the end). */
+//  type = 3;
+    i = 0;
+    stop = 16;
+    qcoeff_ptr = detoken->qcoeff_start_ptr;
+
+//  if( detoken->mode != B_PRED && detoken->mode != SPLITMV)
+    if (type == 1)
+    {
+        i += 24;
+        stop += 8; //24;
+//      type = 1;
+        qcoeff_ptr += 24 * 16;
+//      eobtotal-=16;
+    }
+    /* Work on local copies of the bool decoder state; they are written back before returning. */
+    count   = bc->count;
+    range   = bc->range;
+    value   = bc->value;
+    bufptr  = &bc->buffer[bc->pos];
+
+
+    coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+
+BLOCK_LOOP:
+    a = detoken->A[ detoken->ptr_onyxblock2context_leftabove[i] ];
+    l = detoken->L[ detoken->ptr_onyxblock2context_leftabove[i] ];
+    c = !type; /* first coefficient position: 1 when type is 0, otherwise 0 */
+    a += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2ABOVE_OFFSET];
+    l += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2LEFT_OFFSET];
+
+    //#define ONYX_COMBINEENTROPYCONTEXTS( Dest, A, B)
+    //Dest = ((A)!=0) + ((B)!=0);
+
+    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+
+    Prob = coef_probs;
+    Prob += t * ENTROPY_NODES;
+    t = 0; /* tree index the first walk of this block starts from */
+
+    do
+    {
+
+        {
+//                  onyx_tree_index * onyx_coef_tree_ptr = onyx_coef_tree_x;
+
+            Prob += detoken->ptr_onyx_coef_bands_x[c];
+
+        GET_TOKEN_START:
+            /* Walk vp8_coef_tree_ptr one decoded bit at a time until t is no longer positive. */
+            do
+            {
+                split = 1 + (((range - 1) * (Prob[t>>1])) >> 8);
+
+                if (value >> 24 >= split)
+                {
+                    range = range - split;
+                    value = value - (split << 24);
+                    t += 1;
+
+                    //used to eliminate else branch
+                    split = range;
+                }
+
+                range = split;
+
+                t = detoken->vp8_coef_tree_ptr[ t ];
+
+                NORMALIZE
+
+            }
+            while (t  > 0) ;
+        }
+    GET_TOKEN_STOP:
+        /* Here t <= 0 and -t is the decoded token. */
+        if (t == -DCT_EOB_TOKEN)
+        {
+            break;
+        }
+
+        v = -t;
+
+        if (v > FOUR_TOKEN)
+        {
+            INT16 bits_count;
+            TOKENEXTRABITS *teb_ptr;
+
+//                      teb_ptr = &onyxd_token_extra_bits2[t];
+//                  teb_ptr = &onyxd_token_extra_bits2[v];
+            teb_ptr = &detoken->teb_base_ptr[v];
+
+            /* Start from min_val and add one bit per Probs[] entry, from index Length down to 0. */
+            v = teb_ptr->min_val;
+            bits_count = teb_ptr->Length;
+
+            do
+            {
+                split = 1 + (((range - 1) * teb_ptr->Probs[bits_count]) >> _SCALEDOWN);
+
+                if ((value >> 24) >= split)
+                {
+                    range = range - split;
+                    value = value - (split << 24);
+                    v += ((UINT16)1 << bits_count);
+
+                    //used to eliminate else branch
+                    split = range;
+                }
+
+                range = split;
+
+                NORMALIZE
+
+                bits_count -- ;
+            }
+            while (bits_count >= 0);
+        }
+
+        Prob = coef_probs;
+        /* A nonzero t means a nonzero token: decode its sign bit and pick the Prob row for the next position. */
+        if (t)
+        {
+            split = 1 + (((range - 1) * vp8_prob_half) >> 8);
+
+            if ((value >> 24) >= split)
+            {
+                range = range - split;
+                value = value - (split << 24);
+                v = (v ^ -1) + 1;           /* negate w/out conditionals */
+
+                //used to eliminate else branch
+                split = range;
+            }
+
+            range = split;
+
+            NORMALIZE
+            Prob += ENTROPY_NODES;
+
+            if (t < -ONE_TOKEN)
+                Prob += ENTROPY_NODES;
+
+            t = -2; /* becomes 0 after the += 2 below, so the next walk starts at the tree root */
+        }
+
+        //if t is zero, we will skip the eob table check
+        t += 2;
+        qcoeff_ptr [detoken->scan [c] ] = (INT16) v;
+
+    }
+    while (++c < 16);
+    /* Reached by break with t == -DCT_EOB_TOKEN, or by loop exit with c == 16, which is then stepped back to 15. */
+    if (t != -DCT_EOB_TOKEN)
+    {
+        --c;
+    }
+
+    t = ((detoken->eob[i] = c) != !type);   // any nonzero data?
+//  eobtotal += detoken->eob[i];
+    *a = *l = t;
+    qcoeff_ptr += 16;
+
+    i++;
+
+    if (i < stop)
+        goto BLOCK_LOOP;
+    /* Block 24 just finished: restart at block 0 with type 0. */
+    if (i == 25)
+    {
+        type = 0;
+        i = 0;
+        stop = 16;
+//      coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]);
+        coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+        qcoeff_ptr = detoken->qcoeff_start_ptr;
+        goto BLOCK_LOOP;
+    }
+    /* Blocks 0..15 finished: continue with 16..23 as type 2. */
+    if (i == 16)
+    {
+        type = 2;
+//      coef_probs =(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+        coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]);
+        stop = 24;
+        goto BLOCK_LOOP;
+    }
+    /* Write the bool decoder state back. */
+    bc->count = count;
+    bc->value = value;
+    bc->range = range;
+    bc->pos  = bufptr - bc->buffer;
+    return 0;
+}
+//#if 0
+/*
+ * Decode the coefficient tokens of macroblock x with the ARM token decoder.
+ *
+ * Points dx->detoken at x's bool decoder and above/left entropy contexts,
+ * runs vp8_decode_mb_tokens_v5, then copies the per-block eob values into
+ * x->Block[]. The table pointers inside dx->detoken are not set here;
+ * init_detokenizer fills them in.
+ *
+ * type is 1 when the mode is neither B_PRED nor SPLITMV (block 24 is then
+ * decoded too and the total starts at -16), otherwise 3.
+ *
+ * Returns the sum of the eob values of the blocks decoded for this
+ * macroblock (plus the -16 start value in the type 1 case).
+ */
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+    int eobtotal = 0;
+    int i, type;
+    int num_blocks;
+
+    dx->detoken.current_bc = x->current_bc;
+    dx->detoken.A = x->above_context;
+    dx->detoken.L = x->left_context;
+
+    type = 3;
+    num_blocks = 24;
+
+    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    {
+        type = 1;
+        num_blocks = 25;
+        eobtotal -= 16;
+    }
+
+    vp8_decode_mb_tokens_v5(&dx->detoken, type);
+
+    /* eob[24] is written only on the type 1 path; on the type 3 path it still
+     * holds whatever an earlier call left there, so it is not read. */
+    for (i = 0; i < num_blocks; i++)
+    {
+        x->Block[i].eob = dx->detoken.eob[i];
+        eobtotal += dx->detoken.eob[i];
+    }
+
+    return eobtotal;
+}
+#endif
diff --git a/vp8/decoder/arm/detokenizearm_v6.asm b/vp8/decoder/arm/detokenizearm_v6.asm
new file mode 100644
index 0000000..4d87ee5
--- /dev/null
+++ b/vp8/decoder/arm/detokenizearm_v6.asm
@@ -0,0 +1,364 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_decode_mb_tokens_v5|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    INCLUDE vpx_asm_offsets.asm
+
+l_qcoeff    EQU     0                       ; [sp] slot: qcoeff pointer of the current block
+l_i         EQU     4                       ; [sp] slot: block index i
+l_type      EQU     8                       ; [sp] slot: current type
+l_stop      EQU     12                      ; [sp] slot: block index at which the current pass stops
+l_c         EQU     16                      ; [sp] slot: coefficient position c
+l_l_ptr      EQU     20                     ; [sp] slot: address of the left context entry
+l_a_ptr      EQU     24                     ; [sp] slot: address of the above context entry
+l_bc        EQU     28                      ; [sp] slot: bool decoder pointer
+l_coef_ptr   EQU     32                     ; [sp] slot: coef_probs base for the current type
+l_stacksize EQU     64                      ; bytes of stack the routine reserves for the slots above
+
+
+;; constant offsets -- these should be created at build time
+c_onyxblock2left_offset      EQU 25         ; byte offset of the left row in the block2context_leftabove table
+c_onyxblock2above_offset     EQU 50         ; byte offset of the above row in that table
+c_entropy_nodes              EQU 11         ; multiplier for the context value when forming the Prob pointer
+c_dct_eob_token              EQU 11         ; compared (negated) against t to detect end of block
+
+|vp8_decode_mb_tokens_v5| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    sub         sp, sp, #l_stacksize        ; reserve the l_* slots
+    mov         r7, r1                      ; r7 = type (second argument)
+    mov         r9, r0                      ;DETOK *detoken
+; Register roles set up here: r4 = value, r5 = count, r6 = range, r8 = byte pointer, r9 = detoken, r10 = coef_probs
+    ldr         r1, [r9, #detok_current_bc] ; r1 = bool decoder
+    ldr         r0, [r9, #detok_qcoeff_start_ptr]
+    mov         r11, #0                     ; i = 0
+    mov         r3, #0x10                   ; stop = 16
+
+    cmp         r7, #1                      ; type 1: start at block 24
+    addeq       r11, r11, #24               ; i = 24
+    addeq       r3, r3, #8                  ; stop = 24
+    addeq       r0, r0, #3, 24              ; qcoeff += 0x300 bytes (24 * 16 halfwords)
+
+    str         r0, [sp, #l_qcoeff]
+    str         r11, [sp, #l_i]
+    str         r7, [sp, #l_type]
+    str         r3, [sp, #l_stop]
+    str         r1, [sp, #l_bc]
+
+    add         lr, r9, r7, lsl #2          ; lr = detoken + 4 * type
+
+    ldr         r2, [r1, #bool_decoder_buffer]
+    ldr         r3, [r1, #bool_decoder_pos]
+
+    ldr         r10, [lr, #detok_coef_probs] ; r10 = coef_probs[type]
+    ldr         r5, [r1, #bool_decoder_count]
+    ldr         r6, [r1, #bool_decoder_range]
+    ldr         r4, [r1, #bool_decoder_value]
+    add         r8, r2, r3                  ; r8 = buffer + pos
+
+    str         r10, [sp, #l_coef_ptr]
+
+
+    ;align 4
+BLOCK_LOOP
+    ldr         r3, [r9, #detok_ptr_onyxblock2context_leftabove]
+    ldr         r2, [r9, #DETOK_A]
+    ldr         r1, [r9, #DETOK_L]
+    ldrb        r12, [r3, +r11]                                 ; detoken->ptr_onyxblock2context_leftabove[i]
+; On entry r11 must hold i and r7 the current type.
+    cmp         r7, #0                                          ; check type
+    moveq       r7, #1                                          ; c = !type
+    movne       r7, #0
+
+    ldr         r0, [r2, +r12, lsl #2]                          ; a
+    add         r1, r1, r12, lsl #4                             ; r1 = L + 16 * context index
+    add         r3, r3, r11                                     ; r3 = &table[i]
+
+    ldrb        r2, [r3, #c_onyxblock2above_offset]             ; above index for block i
+    ldrb        r3, [r3, #c_onyxblock2left_offset]              ; left index for block i
+    mov         lr, #c_entropy_nodes
+;;  ;++
+
+    ldr         r2, [r0, +r2, lsl #2]!                          ; r0 = a + 4 * above index (writeback), r2 = *a
+    add         r3, r1, r3, lsl #2                              ; r3 = l
+    str         r3, [sp, #l_l_ptr]
+    ldr         r3, [r3]                                        ; r3 = *l
+
+    cmp         r2, #0                                          ; r2 = (*a != 0) + (*l != 0)
+    movne       r2, #1
+    cmp         r3, #0
+    addne       r2, r2, #1
+
+    str         r0, [sp, #l_a_ptr]
+    smlabb      r0, r2, lr, r10                                 ; r0 = coef_probs + context * c_entropy_nodes
+    mov         r1, #0                                          ; t = 0
+    str         r7, [sp, #l_c]
+
+    ;align 4
+COEFF_LOOP
+    ldr         r3, [r9, #detok_ptr_onyx_coef_bands_x]
+    ldr         lr, [r9, #detok_onyx_coef_tree_ptr]            ; lr = token tree base, used by get_token_loop
+
+;;the following two lines are used if onyx_coef_bands_x is UINT16
+;;  add         r3, r3, r7, lsl #1
+;;  ldrh        r3, [r3]
+
+;;the following line is used if onyx_coef_bands_x is UINT8
+    ldrb        r3, [r7, +r3]                                   ; r3 = bands table byte at index c (r7)
+
+
+;;  ;++
+;;  pld         [r8]
+    ;++
+    add         r0, r0, r3                                      ; Prob += that byte
+
+    ;align 4
+get_token_loop
+    ldrb        r2, [r0, +r1, asr #1]           ; r2 = Prob[t >> 1]
+    mov         r3, r6, lsl #8
+    sub         r3, r3, #256                    ;split = 1 +  (((range-1) * probability) >> 8)
+    mov         r10, #1                         ; r10 reused as the constant 1 (coef_probs stays in l_coef_ptr)
+
+    smlawb      r2, r3, r2, r10                 ; r2 = split
+    ldrb        r12, [r8]                       ;load cx data byte in stall slot
+    ;++
+
+    subs        r3, r4, r2, lsl #24             ;x = value-(split<<24)
+    addhs       r1, r1, #1                      ;t += 1
+    movhs       r4, r3                          ;update value
+    subhs       r2, r6, r2                      ;range = range - split
+    movlo       r6, r2
+
+;;; ldrsbhs     r1, [r1, +lr]
+    ldrsb     r1, [r1, +lr]                     ; t = tree[t] (signed byte)
+
+
+;; use branch for short pipelines ???
+;;  cmp         r2, #0x80
+;;  bcs         |$LN22@decode_mb_to|
+
+    clz         r3, r2                          ; shift = clz(new range) - 24
+    sub         r3, r3, #24
+    subs        r5, r5, r3                      ; count -= shift; flags drive the ..le refill below
+    mov         r6, r2, lsl r3                  ; range = new range << shift
+    mov         r4, r4, lsl r3                  ; value <<= shift
+
+;; use branch for short pipelines ???
+;;  bgt         |$LN22@decode_mb_to|
+
+    addle         r5, r5, #8                    ; count <= 0: count += 8
+    rsble         r3, r5, #8                    ; r3 = 8 - count
+    addle         r8, r8, #1                    ; consume the byte loaded into r12
+    orrle         r4, r4, r12, lsl r3           ; value |= byte << (8 - count)
+
+;;|$LN22@decode_mb_to|
+
+    cmp         r1, #0
+    bgt         get_token_loop                  ; keep walking while t > 0
+
+    cmn         r1, #c_dct_eob_token             ;if(t == -DCT_EOB_TOKEN)
+    beq         END_OF_BLOCK
+
+    rsb         lr, r1, #0                      ;v = -t;
+
+    cmp         lr, #4                          ;if(v > FOUR_TOKEN)
+    ble         SKIP_EXTRABITS
+
+    ldr         r3, [r9, #detok_teb_base_ptr]
+    mov         r11, #1                         ; r11 reused as the constant 1 (i stays in l_i)
+    add         r7, r3, lr, lsl #4              ; r7 = teb_base_ptr + 16 * v (c stays in l_c)
+
+    ldrsh       lr, [r7, #tokenextrabits_min_val];v = teb_ptr->min_val
+    ldrsh       r0, [r7, #tokenextrabits_length];bits_count = teb_ptr->Length
+
+extrabits_loop
+    add         r3, r0, r7
+
+    ldrb        r2, [r3, #4]                    ; byte at teb_ptr + 4 + bits_count (4 is presumably the Probs offset -- confirm)
+    mov         r3, r6, lsl #8
+    sub         r3, r3, #256                    ;split = 1 +  (((range-1) * probability) >> 8)
+    mov         r10, #1
+
+    smlawb      r2, r3, r2, r10                 ; r2 = split
+    ldrb        r12, [r8]                       ; next data byte, used only if a refill is needed
+    ;++
+
+    subs        r10, r4, r2, lsl #24            ;x = value-(split<<24)
+    movhs       r4, r10                         ;update value
+    subhs       r2, r6, r2                      ;range = range - split
+    addhs       lr, lr, r11, lsl r0             ;v += ((UINT16)1<<bits_count)
+    movlo       r6, r2                          ;range = split
+
+
+;; use branch for short pipelines ???
+;;  cmp         r2, #0x80
+;;  bcs         |$LN10@decode_mb_to|
+
+    clz         r3, r2                          ; shift = clz(new range) - 24
+    sub         r3, r3, #24
+    subs        r5, r5, r3                      ; count -= shift
+    mov         r6, r2, lsl r3                  ;range
+    mov         r4, r4, lsl r3                  ;value
+
+    addle       r5, r5, #8                      ; count <= 0: refill one byte
+    addle       r8, r8, #1
+    rsble       r3, r5, #8
+    orrle       r4, r4, r12, lsl r3
+
+;;|$LN10@decode_mb_to|
+    subs         r0, r0, #1                     ; bits_count--
+    bpl         extrabits_loop                  ; loop while bits_count >= 0
+
+
+SKIP_EXTRABITS
+    ldr         r11, [sp, #l_qcoeff]            ; r11 = qcoeff pointer of this block
+    ldr         r0, [sp, #l_coef_ptr]           ; Prob = coef_probs
+
+    cmp         r1, #0                          ;check for nonzero token
+    beq         SKIP_EOB_CHECK              ;if t is zero, we will skip the eob table check
+
+    sub         r3, r6, #1                      ;range - 1
+    ;++
+    mov         r3, r3, lsl #7                  ; *= onyx_prob_half  (128)
+    ;++
+    mov         r3, r3, lsr #8
+    add         r2, r3, #1                      ;split
+
+    subs        r3, r4, r2, lsl #24             ;x = value-(split<<24)
+    movhs       r4, r3                          ;update value
+    subhs       r2, r6, r2                      ;range = range - split
+    mvnhs       r3, lr
+    addhs       lr, r3, #1                      ;v = (v ^ -1) + 1
+    movlo       r6, r2                          ;range = split
+
+;; use branch for short pipelines ???
+;;  cmp         r2, #0x80
+;;  bcs         |$LN6@decode_mb_to|
+
+    clz         r3, r2                          ; shift = clz(new range) - 24
+    sub         r3, r3, #24
+    subs        r5, r5, r3                      ; count -= shift
+    mov         r6, r2, lsl r3
+    mov         r4, r4, lsl r3
+    ldrleb      r2, [r8], #1                    ; count <= 0: fetch one byte and advance
+    addle       r5, r5, #8
+    rsble       r3, r5, #8
+    orrle       r4, r4, r2, lsl r3
+
+;;|$LN6@decode_mb_to|
+    add         r0, r0, #c_entropy_nodes        ; Prob += ENTROPY_NODES
+
+    cmn         r1, #1                          ; t < -ONE_TOKEN ?
+
+    addlt       r0, r0, #c_entropy_nodes        ; then Prob += ENTROPY_NODES again
+
+    mvn         r1, #1                          ; t = -2
+
+SKIP_EOB_CHECK
+    ldr         r7, [sp, #l_c]                      ; r7 = c
+    ldr         r3, [r9, #detok_scan]
+    add         r1, r1, #2                          ; t += 2
+    cmp         r7, #(0x10 - 1)                     ;assume one less for now.... increment below
+
+    ldr         r3, [r3, +r7, lsl #2]               ; r3 = scan[c]
+    add         r7, r7, #1                          ; ++c
+    add         r3, r11, r3, lsl #1                 ; r3 = &qcoeff[scan[c]] (halfword entries)
+
+    str         r7, [sp, #l_c]
+    strh        lr, [r3]                            ; store v
+
+    blt         COEFF_LOOP                          ; uses the flags of the cmp above: old c < 15
+
+    sub         r7, r7, #1                          ;if(t != -DCT_EOB_TOKEN) --c
+
+END_OF_BLOCK
+    ldr         r3, [sp, #l_type]
+    ldr         r10, [sp, #l_coef_ptr]              ; r10 was used as scratch; restore coef_probs
+    ldr         r0, [sp, #l_qcoeff]
+    ldr         r11, [sp, #l_i]                     ; r11 was used as scratch; restore i
+    ldr         r12, [sp, #l_stop]
+
+    cmp         r3, #0                              ; r1 = !type
+    moveq       r1, #1
+    movne       r1, #0
+    add         r3, r11, r9
+
+    cmp         r7, r1                              ; c != !type ?  (flags consumed by movne/moveq below)
+    strb        r7, [r3, #detok_eob]                ; detoken->eob[i] = c
+
+    ldr         r7, [sp, #l_l_ptr]
+    ldr         r2, [sp, #l_a_ptr]
+    movne       r3, #1
+    moveq       r3, #0
+
+    add         r0, r0, #0x20                       ; qcoeff += 16 halfwords
+    add         r11, r11, #1                        ; i++
+    str         r3, [r7]                            ; *l = t
+    str         r3, [r2]                            ; *a = t
+    str         r0, [sp, #l_qcoeff]
+    str         r11, [sp, #l_i]
+
+    cmp         r11, r12                            ;i >= stop ?
+    ldr         r7, [sp, #l_type]
+    mov         lr, #0xB                            ; redundant: BLOCK_LOOP reloads lr before reading it
+
+    blt         BLOCK_LOOP
+
+    cmp         r11, #0x19                          ; i == 25: block 24 done, restart at block 0 with type 0
+    bne         ln2_decode_mb_to
+
+    ldr         r12, [r9, #detok_qcoeff_start_ptr]
+    ldr         r10, [r9, #detok_coef_probs]        ; coef_probs[0]
+    mov         r7, #0
+    mov         r3, #0x10
+    str         r12, [sp, #l_qcoeff]
+    str         r7, [sp, #l_i]
+    str         r7, [sp, #l_type]
+    str         r3, [sp, #l_stop]
+    mov         r11, #0                             ; BLOCK_LOOP indexes the context table with r11, so reset it too (it still held 25)
+    str         r10, [sp, #l_coef_ptr]
+
+    b           BLOCK_LOOP
+
+ln2_decode_mb_to
+    cmp         r11, #0x10                          ; i == 16: blocks 0..15 done, continue with type 2
+    bne         ln1_decode_mb_to
+
+    ldr         r10, [r9, #detok_coef_probs + 8]    ; coef_probs[2], named offset instead of a literal 0x30
+
+    mov         r7, #2                              ; type = 2
+    mov         r3, #0x18                           ; stop = 24
+
+    str         r7, [sp, #l_type]
+    str         r3, [sp, #l_stop]
+
+    str         r10, [sp, #l_coef_ptr]
+    b           BLOCK_LOOP
+
+ln1_decode_mb_to
+    ldr         r2, [sp, #l_bc]                     ; r2 = bool decoder
+    mov         r0, #0                              ; return value 0
+    nop
+
+    ldr         r3, [r2, #bool_decoder_buffer]
+    str         r5, [r2, #bool_decoder_count]       ; write the decoder state back
+    str         r4, [r2, #bool_decoder_value]
+    sub         r3, r8, r3                          ; pos = byte pointer - buffer
+    str         r3, [r2, #bool_decoder_pos]
+    str         r6, [r2, #bool_decoder_range]
+
+    add         sp, sp, #l_stacksize                ; release the l_* slots
+    ldmia       sp!, {r4 - r11, pc}                 ; restore saved registers and return
+
+    ENDP  ; |vp8_decode_mb_tokens_v5|
+
+    END
diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c
new file mode 100644
index 0000000..455c83a
--- /dev/null
+++ b/vp8/decoder/arm/dsystemdependent.c
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "blockd.h"
+#include "pragmas.h"
+#include "postproc.h"
+#include "dboolhuff.h"
+#include "dequantize.h"
+#include "onyxd_int.h"
+
+void vp8_dmachine_specific_config(VP8D_COMP *pbi)
+{
+    /* Fills the decoder's run-time dispatch tables for ARM builds. */
+#if CONFIG_RUNTIME_CPU_DETECT
+    pbi->mb.rtcd         = &pbi->common.rtcd;
+#if HAVE_ARMV7
+    pbi->dequant.block   = vp8_dequantize_b_neon;
+    pbi->dequant.idct    = vp8_dequant_idct_neon;
+    pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon;
+#elif HAVE_ARMV6
+    pbi->dequant.block   = vp8_dequantize_b_v6;
+    pbi->dequant.idct    = vp8_dequant_idct_v6;
+    pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6;
+#endif
+#if HAVE_ARMV7 || HAVE_ARMV6
+    /* Both ARM variants use the C bool decoder entry points. */
+    pbi->dboolhuff.start = vp8dx_start_decode_c;
+    pbi->dboolhuff.stop  = vp8dx_stop_decode_c;
+    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
+#if 0 /* dboolhuff.c compiles these two out (#if 0), so naming them cannot link */
+    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
+    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
+#endif
+#endif
+#endif
+}
diff --git a/vp8/decoder/arm/neon/dboolhuff_neon.asm b/vp8/decoder/arm/neon/dboolhuff_neon.asm
new file mode 100644
index 0000000..7ec62a3
--- /dev/null
+++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm
@@ -0,0 +1,159 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_decode_value_neon|
+    EXPORT  |vp8dx_start_decode_neon|
+    EXPORT  |vp8dx_stop_decode_neon|
+    EXPORT  |vp8dx_decode_bool_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    INCLUDE vpx_asm_offsets.asm
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;   int z = 0;
+;   int bit;
+;   for ( bit=bits-1; bit>=0; bit-- )
+;   {
+;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
+;   }
+;   return z;
+
+;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits )
+|vp8_decode_value_neon| PROC
+    stmdb   sp!, {r4 - r6, lr}
+    mov     r4, r0                  ;r4 = br, reloaded into r0 before every call
+    mov     r5, r1                  ;r5 = bits
+    mov     r6, #0                  ;r6 = z, the accumulated result
+; Start at bit = bits-1; a negative result means bits <= 0, so z = 0 is returned.
+    subs    r5, r5, #1
+    bmi     decode_value_exit
+; One bool per pass at probability 0x80, most significant bit first.
+decode_value_loop
+    mov     r1, #0x80
+    mov     r0, r4
+    bl      vp8dx_decode_bool_neon_internal       ; needed for conversion to s file
+    orr     r6, r6, r0, lsl r5      ;z |= decoded bit << bit
+    subs    r5, r5, #1
+    bpl     decode_value_loop
+
+decode_value_exit
+    mov     r0, r6                  ;return z
+    ldmia   sp!, {r4 - r6, pc}
+    ENDP    ; |vp8_decode_value_neon|
+
+
+;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source )
+|vp8dx_start_decode_neon| PROC
+    stmdb   sp!, {r4 - r5, lr}
+    mov     r2, #0
+    mov     r3, #255
+; Initial state: lowvalue = 0, range = 255, buffer = source.
+    str     r2, [r0, #bool_decoder_lowvalue]
+    str     r3, [r0, #bool_decoder_range]
+    str     r1, [r0, #bool_decoder_buffer]
+
+    mov     r3, #8
+    mov     r2, #4
+    str     r3, [r0, #bool_decoder_count]       ;count = 8
+    str     r2, [r0, #bool_decoder_pos]         ;pos = 4: source[0..3] are read just below
+; Read source[0..3]; no length is passed in, so four readable bytes are assumed.
+    ldrb    r2, [r1, #3]
+    ldrb    r3, [r1, #2]
+    ldrb    r4, [r1, #1]
+    ldrb    r5, [r1]
+; Pack them into one word with source[0] in the top byte.
+    orr     r1, r2, r3, lsl #8
+    orr     r1, r1, r4, lsl #16
+    orr     r1, r1, r5, lsl #24
+
+    str     r1, [r0, #bool_decoder_value]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ; |vp8dx_start_decode_neon|
+
+
+;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc );
+|vp8dx_stop_decode_neon| PROC
+    mov     pc, lr                  ;no work to do here: return at once, bc is not touched
+    ENDP    ; |vp8dx_stop_decode_neon|
+
+
+; Register use in this routine:
+; r0    in: br (copied to lr)      out: the decoded bit
+; r1    in: probability            later: address of buffer[pos]
+; r2    range
+; r3    value
+; r4    split                      later: count
+; r5    value - (split << 24)      later: pos
+; r12   the constant 1             later: shift, then the byte read from buffer
+;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability )
+|vp8dx_decode_bool_neon| PROC
+vp8dx_decode_bool_neon_internal
+;LDRD and STRD doubleword data transfers must be eight-byte aligned. Use ALIGN 8
+;before memory allocation
+    stmdb   sp!, {r4 - r5, lr}
+
+    ldr     r2, [r0, #bool_decoder_range]       ;load range (r2), value(r3)
+    ldr     r3, [r0, #bool_decoder_value]
+    ;ldrd   r2, r3, [r0, #bool_decoder_range]   ;ldrd costs 2 cycles
+    ;
+
+    mov     r4, r2, lsl #8          ;r4 = range << 8
+    sub     r4, r4, #256            ;r4 = (range - 1) << 8
+    mov     r12, #1                 ;the "1 +" term, added by smlawb below
+
+    smlawb  r4, r4, r1, r12         ;split = 1 +  (((range-1) * probability) >> 8)
+
+    mov     lr, r0                  ;lr = br, because r0 now becomes the result
+    mov     r0, #0                  ;bit = 0
+    ;
+    subs    r5, r3, r4, lsl #24     ;r5 = value - (split << 24); HS when value >= split << 24
+
+    subhs   r2, r2, r4              ;range = br->range-split
+    movlo   r2, r4                  ;range = split
+    movhs   r0, #1                  ;bit = 1
+    movhs   r3, r5                  ;value = value-bigsplit
+
+    cmp     r2, #0x80               ;range >= 0x80 needs no renormalisation
+    blt     range_less_0x80
+    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
+
+    ldmia   sp!, {r4 - r5, pc}
+
+range_less_0x80
+
+    ldrd    r4, r5, [lr, #bool_decoder_count]   ;load count, pos, buffer
+    ldr     r1, [lr, #bool_decoder_buffer]
+
+    clz     r12, r2                 ;leading zero bits of range
+    add     r1, r1, r5              ;r1 = &buffer[pos]
+
+    sub     r12, r12, #24           ;shift = clz(range) - 24
+    subs    r4, r4, r12             ;count -= shift
+    mov     r2, r2, lsl r12         ;range <<= shift
+    mov     r3, r3, lsl r12         ;value <<= shift
+    addle   r4, r4, #8              ;count += 8
+    ldrleb  r12, [r1], #1           ;br->buffer[br->pos]
+
+    rsble   r1, r4, #8              ;8 - count, i.e. minus the count from before the += 8
+    addle   r5, r5, #1              ;br->pos++
+    orrle   r3, r3, r12, lsl r1     ;value |= (br->buffer[br->pos]) << (-count)
+
+    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
+    strd    r4, r5, [lr, #bool_decoder_count]
+
+    ldmia   sp!, {r4 - r5, pc}
+    ENDP    ; |vp8dx_decode_bool_neon|
+
+    END
diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequantdcidct_neon.asm
new file mode 100644
index 0000000..3392f2c
--- /dev/null
+++ b/vp8/decoder/arm/neon/dequantdcidct_neon.asm
@@ -0,0 +1,133 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_dc_idct_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc);
+; r0    short *input,
+; r1    short *dq,
+; r2    short *output,
+; r3    int pitch,
+; (stack)   int Dc
+|vp8_dequant_dc_idct_neon| PROC
+    vld1.16         {q8, q9}, [r0]          ;16 coefficients from input
+    vld1.16         {q10, q11}, [r1]        ;16 factors from dq
+; Scratch lives in q8-q11, not q3-q6: d8-d15 are callee-saved under the AAPCS.
+    ldr             r1, [sp]                ;load Dc from stack
+
+    ldr             r12, _dcidct_coeff_
+
+    vmul.i16        q1, q8, q10             ;input for short_idct4x4llm_neon
+    vmul.i16        q2, q9, q11
+
+    vmov.16         d2[0], r1               ;element 0 is replaced by Dc
+
+;|short_idct4x4llm_neon| PROC
+    vld1.16         {d0}, [r12]
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q8, q2, d0[2]
+    vqdmulh.s16     q9, q2, d0[0]
+
+    vqadd.s16       d22, d2, d3             ;a1
+    vqsub.s16       d23, d2, d3             ;b1
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    vqadd.s16       q8, q8, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q9, q9, q2
+
+    ;d16 - c1:temp1
+    ;d17 - d1:temp2
+    ;d18 - d1:temp1
+    ;d19 - c1:temp2
+
+    vqsub.s16       d20, d16, d19           ;c1
+    vqadd.s16       d21, d17, d18           ;d1
+
+    vqadd.s16       d2, d22, d21
+    vqadd.s16       d3, d23, d20
+    vqsub.s16       d4, d23, d20
+    vqsub.s16       d5, d22, d21
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+; memset(input, 0, 32) -- 32bytes
+    vmov.i16        q14, #0
+
+    vswp            d3, d4
+    vqdmulh.s16     q8, q2, d0[2]
+    vqdmulh.s16     q9, q2, d0[0]
+
+    vqadd.s16       d22, d2, d3             ;a1
+    vqsub.s16       d23, d2, d3             ;b1
+
+    vmov            q15, q14
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    vqadd.s16       q8, q8, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q9, q9, q2
+
+    vqsub.s16       d20, d16, d19           ;c1
+    vqadd.s16       d21, d17, d18           ;d1
+
+    vqadd.s16       d2, d22, d21
+    vqadd.s16       d3, d23, d20
+    vqsub.s16       d4, d23, d20
+    vqsub.s16       d5, d22, d21
+
+    vst1.16         {q14, q15}, [r0]        ;zero the 32 bytes at input
+
+    vrshr.s16       d2, d2, #3              ;rounding shift right by 3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    add             r1, r2, r3              ;row addresses: output + pitch,
+    add             r12, r1, r3             ;output + 2*pitch,
+    add             r0, r12, r3             ;output + 3*pitch (pitch is added as a byte count)
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vst1.16         {d2}, [r2]
+    vst1.16         {d3}, [r1]
+    vst1.16         {d4}, [r12]
+    vst1.16         {d5}, [r0]
+
+    bx             lr
+
+    ENDP
+
+;-----------------
+    AREA    dcidct4x4_dat, DATA, READWRITE          ;read/write by default
+;Data area dcidct4x4_dat holds the IDCT constants used by the routine above.
+;_dcidct_coeff_ stores the address of dcidct_coeff; the routine loads that address
+;and then reads the four 16-bit constants (two DCD words) through it.
+_dcidct_coeff_
+    DCD     dcidct_coeff
+dcidct_coeff
+    DCD     0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequantidct_neon.asm
new file mode 100644
index 0000000..bba4d5d
--- /dev/null
+++ b/vp8/decoder/arm/neon/dequantidct_neon.asm
@@ -0,0 +1,128 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_idct_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch);
+; r0    short *input,
+; r1    short *dq,
+; r2    short *output,
+; r3    int pitch,
+|vp8_dequant_idct_neon| PROC
+    vld1.16         {q8, q9}, [r0]          ;16 coefficients from input
+    vld1.16         {q10, q11}, [r1]        ;16 factors from dq
+; Scratch lives in q8-q11, not q3-q6: d8-d15 are callee-saved under the AAPCS.
+    ldr             r12, _didct_coeff_
+
+    vmul.i16        q1, q8, q10             ;input for short_idct4x4llm_neon
+    vmul.i16        q2, q9, q11
+
+;|short_idct4x4llm_neon| PROC
+    vld1.16         {d0}, [r12]
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q8, q2, d0[2]
+    vqdmulh.s16     q9, q2, d0[0]
+
+    vqadd.s16       d22, d2, d3             ;a1
+    vqsub.s16       d23, d2, d3             ;b1
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    vqadd.s16       q8, q8, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q9, q9, q2
+
+    ;d16 - c1:temp1
+    ;d17 - d1:temp2
+    ;d18 - d1:temp1
+    ;d19 - c1:temp2
+
+    vqsub.s16       d20, d16, d19           ;c1
+    vqadd.s16       d21, d17, d18           ;d1
+
+    vqadd.s16       d2, d22, d21
+    vqadd.s16       d3, d23, d20
+    vqsub.s16       d4, d23, d20
+    vqsub.s16       d5, d22, d21
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+; memset(input, 0, 32) -- 32bytes
+    vmov.i16        q14, #0
+
+    vswp            d3, d4
+    vqdmulh.s16     q8, q2, d0[2]
+    vqdmulh.s16     q9, q2, d0[0]
+
+    vqadd.s16       d22, d2, d3             ;a1
+    vqsub.s16       d23, d2, d3             ;b1
+
+    vmov            q15, q14
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    vqadd.s16       q8, q8, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q9, q9, q2
+
+    vqsub.s16       d20, d16, d19           ;c1
+    vqadd.s16       d21, d17, d18           ;d1
+
+    vqadd.s16       d2, d22, d21
+    vqadd.s16       d3, d23, d20
+    vqsub.s16       d4, d23, d20
+    vqsub.s16       d5, d22, d21
+
+    vst1.16         {q14, q15}, [r0]        ;zero the 32 bytes at input
+
+    vrshr.s16       d2, d2, #3              ;rounding shift right by 3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    add             r1, r2, r3              ;row addresses: output + pitch,
+    add             r12, r1, r3             ;output + 2*pitch,
+    add             r0, r12, r3             ;output + 3*pitch (pitch is added as a byte count)
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vst1.16         {d2}, [r2]
+    vst1.16         {d3}, [r1]
+    vst1.16         {d4}, [r12]
+    vst1.16         {d5}, [r0]
+
+    bx             lr
+
+    ENDP
+
+;-----------------
+    AREA    didct4x4_dat, DATA, READWRITE           ;read/write by default
+;Data area didct4x4_dat holds the IDCT constants used by the routine above.
+;_didct_coeff_ stores the address of didct_coeff; the routine loads that address
+;and then reads the four 16-bit constants (two DCD words) through it.
+_didct_coeff_
+    DCD     didct_coeff
+didct_coeff
+    DCD     0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/decoder/arm/neon/dequantizeb_neon.asm
new file mode 100644
index 0000000..1bde946
--- /dev/null
+++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm
@@ -0,0 +1,33 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_neon| PROC
+    vld1.16         {q0, q1}, [r0]          ;16 coefficients from Q
+    vld1.16         {q2, q3}, [r1]          ;16 factors from DQC
+; Products go to q8/q9: q4-q7 (d8-d15) are callee-saved under the AAPCS.
+    vmul.i16        q8, q0, q2
+    vmul.i16        q9, q1, q3
+
+    vst1.16         {q8, q9}, [r2]          ;DQ[i] = Q[i] * DQC[i], i = 0..15
+
+    bx             lr
+
+    ENDP
+
+    END
diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
new file mode 100644
index 0000000..442054e
--- /dev/null
+++ b/vp8/decoder/dboolhuff.c
@@ -0,0 +1,174 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+
+DECLARE_ALIGNED(16, const unsigned int, vp8dx_bitreader_norm[256]) =
+{   /* [r] = left shift that moves the highest set bit of r to bit 7; [0] is 0 */
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+
+static void copy_in(BOOL_DECODER *br, unsigned int to_write)
+{
+    /* Never take more than the caller's buffer still has to offer. */
+    const unsigned int n = to_write > br->user_buffer_sz ? br->user_buffer_sz
+                                                         : to_write;
+    memcpy(br->write_ptr, br->user_buffer, n);
+    br->write_ptr = br_ptr_advance(br->write_ptr, n);
+    br->user_buffer_sz -= n;
+    br->user_buffer += n;
+}
+
+int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
+                        unsigned int source_sz)
+{   /* Returns 0 on success; 1 for a null source with nonzero size or a failed allocation. */
+    br->lowvalue = 0;
+    br->range    = 255;
+    br->count    = 0;
+    br->user_buffer    = source;
+    br->user_buffer_sz = source_sz;
+    /* Nothing is owned until the allocation below succeeds: clear the pointer so
+     * an early return never leaves it indeterminate for vp8dx_stop_decode_c(). */
+    br->decode_buffer  = 0;
+
+    if (source_sz && !source)
+        return 1;
+
+    /* Allocate the ring buffer backing store with alignment equal to the
+     * buffer size*2 so that a single pointer can be used for wrapping rather
+     * than a pointer+offset.
+     */
+    br->decode_buffer  = vpx_memalign(VP8_BOOL_DECODER_SZ * 2,
+                                      VP8_BOOL_DECODER_SZ);
+    if (!br->decode_buffer)
+        return 1;
+
+    /* Populate the buffer */
+    br->read_ptr = br->decode_buffer;
+    br->write_ptr = br->decode_buffer;
+    copy_in(br, VP8_BOOL_DECODER_SZ);
+    /* Read the first byte */
+    br->value = (*br->read_ptr++) << 8;
+    return 0;
+}
+
+
+void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
+{
+    /* Free space is the run that starts at the write pointer ("tail")
+     * plus, when that run reaches the end of the ring, a second run at
+     * the start of the ring ("head").
+     */
+    int tail = br->read_ptr - br->write_ptr;
+    int head = 0;
+
+    if (tail < 0)
+    {
+        /* The read pointer is behind the write pointer: writable space
+         * runs to the end of the ring and resumes at its start.
+         */
+        head = br->read_ptr - br->decode_buffer;
+        tail = VP8_BOOL_DECODER_SZ - (br->write_ptr - br->decode_buffer);
+    }
+
+    /* Refill only when at least 128 bytes of room are available. */
+    if (tail + head >= 128)
+    {
+        if (tail)
+            copy_in(br, tail);
+        if (head)
+        {
+            br->write_ptr = br->decode_buffer;
+            copy_in(br, head);
+        }
+    }
+}
+
+
+void vp8dx_stop_decode_c(BOOL_DECODER *bc)
+{
+    vpx_free(bc->decode_buffer);  /* release the ring buffer storage */
+    bc->decode_buffer = 0;        /* and do not keep a pointer to freed memory */
+}
+
+#if 0
+/*
+ * Until optimized versions of these functions are available, we
+ * keep the implementation in the header to allow inlining.
+ *
+ * The RTCD-style invocations are still in place so this can
+ * be switched by just uncommenting these functions here and
+ * the DBOOLHUFF_INVOKE calls in the header.
+ */
+int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
+{
+    unsigned int bit=0;
+    unsigned int split;
+    unsigned int bigsplit;
+    register unsigned int range = br->range;
+    register unsigned int value = br->value;
+    /* split divides the range according to probability; bigsplit is split << 8 */
+    split = 1 + (((range-1) * probability) >> 8);
+    bigsplit = (split<<8);
+    /* assume bit 0 (lower part, range = split) unless value reaches bigsplit */
+    range = split;
+    if(value >= bigsplit)
+    {
+        range = br->range-split;
+        value = value-bigsplit;
+        bit = 1;
+    }
+    /* disabled shortcut for range >= 0x80; the table entry used below is 0 there */
+    /*if(range>=0x80)
+    {
+        br->value = value;
+        br->range = range;
+        return bit;
+    }*/
+    /* renormalise: shift range and value left by the table entry for range */
+    {
+        int count = br->count;
+        register unsigned int shift = vp8dx_bitreader_norm[range];
+        range <<= shift;
+        value <<= shift;
+        count -= shift;
+        if(count <= 0)
+        {
+            value |= (*br->read_ptr) << (-count);  /* merge the next ring-buffer byte */
+            br->read_ptr = br_ptr_advance(br->read_ptr, 1);
+            count += 8 ;
+        }
+        br->count = count;
+    }
+    br->value = value;
+    br->range = range;
+    return bit;
+}
+
+int vp8dx_decode_value_c(BOOL_DECODER *br, int bits)
+{
+    int z = 0;  /* result, built from the most significant bit downwards */
+    int bit;
+    for ( bit=bits-1; bit>=0; bit-- )  /* runs zero times when bits <= 0 */
+    {
+        z |= (vp8dx_decode_bool(br, 0x80)<<bit);  /* one bool at probability 0x80 */
+    }
+    return z;
+}
+#endif
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
new file mode 100644
index 0000000..f5c9822
--- /dev/null
+++ b/vp8/decoder/dboolhuff.h
@@ -0,0 +1,226 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DBOOLHUFF_H
+#define DBOOLHUFF_H
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_integer.h"
+
+/* Size of the bool decoder backing storage
+ *
+ * This size was chosen to be greater than the worst case encoding of a
+ * single macroblock. This was calculated as follows (python):
+ *
+ *     def max_cost(prob):
+ *         return max(prob_costs[prob], prob_costs[255-prob]) / 256;
+ *
+ *     tree_nodes_cost = 7 * max_cost(255)
+ *     extra_bits_cost = sum([max_cost(bit) for bit in extra_bits])
+ *     sign_bit_cost = max_cost(128)
+ *     total_cost = tree_nodes_cost + extra_bits_cost + sign_bit_cost
+ *
+ * where the prob_costs table was taken from the C vp8_prob_cost table in
+ * boolhuff.c and the extra_bits table was taken from the 11 extrabits for
+ * a category 6 token as defined in vp8d_token_extra_bits2/detokenize.c
+ *
+ * This equation produced a maximum of 79 bits per coefficient. Scaling up
+ * to the macroblock level:
+ *
+ *     79 bits/coeff * 16 coeff/block * 25 blocks/macroblock = 31600 b/mb
+ *
+ *     4096 bytes = 32768 bits > 31600
+ */
+#define VP8_BOOL_DECODER_SZ       4096
+#define VP8_BOOL_DECODER_MASK     (VP8_BOOL_DECODER_SZ-1)
+#define VP8_BOOL_DECODER_PTR_MASK (~(uintptr_t)(VP8_BOOL_DECODER_SZ))
+
+struct vp8_dboolhuff_rtcd_vtable;
+
+typedef struct
+{
+    unsigned int         lowvalue;        /* zeroed by vp8dx_start_decode_c() */
+    unsigned int         range;           /* current range; starts at 255 */
+    unsigned int         value;           /* bit window compared against split << 8 */
+    int                  count;           /* a new byte is merged into value when this drops to <= 0 */
+    const unsigned char *user_buffer;     /* caller's input not yet copied into the ring */
+    unsigned int         user_buffer_sz;  /* bytes remaining at user_buffer */
+    unsigned char       *decode_buffer;   /* ring buffer storage, VP8_BOOL_DECODER_SZ bytes */
+    const unsigned char *read_ptr;        /* next ring byte the decoder consumes */
+    unsigned char       *write_ptr;       /* next ring position copy_in() fills */
+#if CONFIG_RUNTIME_CPU_DETECT
+    struct vp8_dboolhuff_rtcd_vtable *rtcd;  /* table recorded by vp8dx_start_decode() */
+#endif
+} BOOL_DECODER;
+
+#define prototype_dbool_start(sym) int sym(BOOL_DECODER *br, \
+    const unsigned char *source, unsigned int source_sz)
+#define prototype_dbool_stop(sym) void sym(BOOL_DECODER *bc)
+#define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
+#define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
+#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits) /* no ';': each use supplies its own */
+
+#if ARCH_ARM
+#include "arm/dboolhuff_arm.h"
+#endif
+
+#ifndef vp8_dbool_start
+#define vp8_dbool_start vp8dx_start_decode_c
+#endif
+
+#ifndef vp8_dbool_stop
+#define vp8_dbool_stop vp8dx_stop_decode_c
+#endif
+
+#ifndef vp8_dbool_fill
+#define vp8_dbool_fill vp8dx_bool_decoder_fill_c
+#endif
+
+#ifndef vp8_dbool_debool
+#define vp8_dbool_debool vp8dx_decode_bool_c
+#endif
+
+#ifndef vp8_dbool_devalue
+#define vp8_dbool_devalue vp8dx_decode_value_c
+#endif
+
+extern prototype_dbool_start(vp8_dbool_start);
+extern prototype_dbool_stop(vp8_dbool_stop);
+extern prototype_dbool_fill(vp8_dbool_fill);
+extern prototype_dbool_debool(vp8_dbool_debool);
+extern prototype_dbool_devalue(vp8_dbool_devalue);
+
+typedef prototype_dbool_start((*vp8_dbool_start_fn_t));
+typedef prototype_dbool_stop((*vp8_dbool_stop_fn_t));
+typedef prototype_dbool_fill((*vp8_dbool_fill_fn_t));
+typedef prototype_dbool_debool((*vp8_dbool_debool_fn_t));
+typedef prototype_dbool_devalue((*vp8_dbool_devalue_fn_t));
+
+typedef struct vp8_dboolhuff_rtcd_vtable {
+    vp8_dbool_start_fn_t   start;    /* int  (br, source, source_sz) */
+    vp8_dbool_stop_fn_t    stop;     /* void (bc) */
+    vp8_dbool_fill_fn_t    fill;     /* void (br) */
+    vp8_dbool_debool_fn_t  debool;   /* int  (br, probability) */
+    vp8_dbool_devalue_fn_t devalue;  /* int  (br, bits) */
+} vp8_dboolhuff_rtcd_vtable_t;
+
+// There are no processor-specific versions of these
+// functions right now. Disable RTCD to avoid using
+// function pointers which gives a speed boost
+//#ifdef ENABLE_RUNTIME_CPU_DETECT
+//#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
+//#define IF_RTCD(x) (x)
+//#else
+#define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
+#define IF_RTCD(x) NULL
+//#endif
+
+static unsigned char *br_ptr_advance(const unsigned char *_ptr,
+                                     unsigned int n)
+{
+    /* Step n bytes forward, then clear the VP8_BOOL_DECODER_SZ bit of the
+     * address (VP8_BOOL_DECODER_PTR_MASK is the complement of that bit).
+     */
+    const uintptr_t advanced = (uintptr_t)_ptr + n;
+
+    return (void *)(advanced & VP8_BOOL_DECODER_PTR_MASK);
+}
+
+DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
+
+/* wrapper functions to hide RTCD. static means inline means hopefully no
+ * penalty
+ */
+static int vp8dx_start_decode(BOOL_DECODER *br,
+        struct vp8_dboolhuff_rtcd_vtable *rtcd,
+        const unsigned char *source, unsigned int source_sz) {
+#if CONFIG_RUNTIME_CPU_DETECT
+    br->rtcd = rtcd;  /* kept in br so the other wrappers need no table argument */
+#endif
+    return DBOOLHUFF_INVOKE(rtcd, start)(br, source, source_sz);  /* expands to vp8_dbool_start(...) */
+}
+static void vp8dx_stop_decode(BOOL_DECODER *br) {
+    DBOOLHUFF_INVOKE(br->rtcd, stop)(br);  /* expands to vp8_dbool_stop(br); the first argument is dropped */
+}
+static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
+    DBOOLHUFF_INVOKE(br->rtcd, fill)(br);  /* expands to vp8_dbool_fill(br); the first argument is dropped */
+}
+static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+  /*
+   * Decodes one bool. The decoder lives in this header so that it can be
+   * inlined; the dispatching form would be
+   *
+   *return DBOOLHUFF_INVOKE(br->rtcd, debool)(br, probability);
+   */
+    const unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
+    const unsigned int bigsplit = split << 8;
+    unsigned int value = br->value;
+    unsigned int range;
+    unsigned int shift;
+    unsigned int bit;
+    int count = br->count;
+
+    /* A value below bigsplit decodes as 0 and keeps the lower part of the
+     * range; otherwise the bit is 1 and the upper part is kept.
+     */
+    if (value < bigsplit)
+    {
+        bit = 0;
+        range = split;
+    }
+    else
+    {
+        bit = 1;
+        range = br->range - split;
+        value -= bigsplit;
+    }
+
+    /* Renormalise. No shortcut is taken for range >= 0x80: the table
+     * entry is then 0 and the refill test below still runs.
+     */
+    shift = vp8dx_bitreader_norm[range];
+    range <<= shift;
+    value <<= shift;
+    count -= shift;
+
+    if (count <= 0)
+    {
+        /* Merge the next ring-buffer byte into value. */
+        value |= (*br->read_ptr) << (-count);
+        br->read_ptr = br_ptr_advance(br->read_ptr, 1);
+        count += 8;
+    }
+
+    br->count = count;
+    br->value = value;
+    br->range = range;
+
+    return bit;
+}
+
+static int vp8_decode_value(BOOL_DECODER *br, int bits)
+{
+  /*
+   * Reads 'bits' bools at probability 0x80, most significant first, and
+   * returns them packed into an int (0 when bits <= 0). Kept in the
+   * header to allow inlining; the dispatching form would be
+   *
+   *return DBOOLHUFF_INVOKE(br->rtcd, devalue)(br, bits);
+   */
+    int z = 0;
+    int bit = bits;
+
+    while (--bit >= 0)
+        z |= (vp8dx_decode_bool(br, 0x80) << bit);
+
+    return z;
+
+}
+#endif
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
new file mode 100644
index 0000000..6035f3e
--- /dev/null
+++ b/vp8/decoder/decodemv.c
@@ -0,0 +1,418 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "entropymv.h"
+#include "entropymode.h"
+#include "onyxd_int.h"
+#include "findnearmv.h"
+#include "demode.h"
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+
+static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
+{  /* Returns one signed motion-vector component read from r using the probabilities in *mvc. */
+    const vp8_prob *const p = (const vp8_prob *) mvc;  /* the context is indexed as a flat array of probabilities */
+    int x = 0;
+
+    if (vp8_read(r, p [mvpis_short]))  /* Large: magnitude is read bit by bit */
+    {
+        int i = 0;
+
+        do  /* bits 0..2, lowest first */
+        {
+            x += vp8_read(r, p [MVPbits + i]) << i;
+        }
+        while (++i < 3);
+
+        i = mvlong_width - 1;  /* Skip bit 3, which is sometimes implicit */
+
+        do  /* bits mvlong_width-1 down to 4 */
+        {
+            x += vp8_read(r, p [MVPbits + i]) << i;
+        }
+        while (--i > 3);
+
+        if (!(x & 0xFFF0)  ||  vp8_read(r, p [MVPbits + 3]))  /* bit 3 is read only when a higher bit is set; otherwise it is taken as 1 */
+            x += 8;
+    }
+    else   /* small: magnitude comes from vp8_small_mvtree */
+        x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+    if (x  &&  vp8_read(r, p [MVPsign]))  /* the sign flag is read only for a non-zero magnitude */
+        x = -x;
+
+    return x;
+}
+
+static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
+{  /* Reads the row and then the column component into *mv, each doubled. */
+    mv->row = (short)(read_mvcomponent(r,   mvc) * 2);  /* multiply, not shift: the component can be negative */
+    mv->col = (short)(read_mvcomponent(r, ++mvc) * 2);  /* the column uses the next MV_CONTEXT entry */
+}
+
+
+static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
+{
+    int ctx;
+
+    for (ctx = 0; ctx < 2; ++ctx)  /* mvc[0], then mvc[1] */
+    {
+        const vp8_prob *const update_prob = vp8_mv_update_probs[ctx].prob;
+        vp8_prob *const probs = (vp8_prob *)(mvc + ctx);
+        int k = 0;
+
+        /* Visit the MVPcount probabilities of this context in order. */
+        do
+        {
+            if (vp8_read(bc, update_prob[k]))  /* one update flag per entry */
+            {
+                const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7);
+
+                probs[k] = x ? x << 1 : 1;  /* 7-bit literal doubled; zero maps to 1 */
+            }
+        }
+        while (++k < MVPcount);
+    }
+}
+
+
+static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p)
+{
+    /* One vp8_treed_read() over vp8_mv_ref_tree with probabilities p;
+     * the integer result is returned as an MB_PREDICTION_MODE. */
+    return (MB_PREDICTION_MODE)(int)vp8_treed_read(bc, vp8_mv_ref_tree, p);
+}
+
+static MB_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p)
+{
+    /* One vp8_treed_read() over vp8_sub_mv_ref_tree with probabilities p;
+     * the integer result is returned as an MB_PREDICTION_MODE. */
+    return (MB_PREDICTION_MODE)(int)vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
+}
+unsigned int vp8_mv_cont_count[5][4] =  /* counters; in this file they are only incremented under VPX_MODE_COUNT */
+{  /* first index: the value returned by vp8_mv_cont(); second index: 0 LEFT4X4, 1 ABOVE4X4, 2 ZERO4X4, 3 NEW4X4 */
+    { 0, 0, 0, 0 },
+    { 0, 0, 0, 0 },
+    { 0, 0, 0, 0 },
+    { 0, 0, 0, 0 },
+    { 0, 0, 0, 0 }
+};
+
+void vp8_decode_mode_mvs(VP8D_COMP *pbi)
+{
+    /* Reads the frame-level mode/MV probabilities, then the mode info of every macroblock into pc->mi. */
+    const MV Zero = { 0, 0};
+    VP8_COMMON *const pc = & pbi->common;
+    vp8_reader *const bc = & pbi->bc;
+
+    MODE_INFO *mi = pc->mi;
+    const int mis = pc->mode_info_stride;
+
+    MV_CONTEXT *const mvc = pc->fc.mvc;
+
+    int mb_row = -1;
+
+    vp8_prob prob_intra;
+    vp8_prob prob_last;
+    vp8_prob prob_gf;
+    vp8_prob prob_skip_false = 0;
+
+    if (pc->mb_no_coeff_skip)
+        prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8);
+
+    prob_intra = (vp8_prob)vp8_read_literal(bc, 8);
+    prob_last  = (vp8_prob)vp8_read_literal(bc, 8);
+    prob_gf    = (vp8_prob)vp8_read_literal(bc, 8);
+
+    /* Each of the next two flag bits, when set, is followed by new 8-bit intra mode probabilities. */
+
+    if (vp8_read_bit(bc))
+    {
+        int i = 0;
+
+        do
+        {
+            pc->fc.ymode_prob[i] = (vp8_prob) vp8_read_literal(bc, 8);
+        }
+        while (++i < 4);
+    }
+
+    if (vp8_read_bit(bc))
+    {
+        int i = 0;
+
+        do
+        {
+            pc->fc.uv_mode_prob[i] = (vp8_prob) vp8_read_literal(bc, 8);
+        }
+        while (++i < 3);
+    }
+
+    read_mvcontexts(bc, mvc);
+
+    while (++mb_row < pc->mb_rows)
+    {
+        int mb_col = -1;
+
+        while (++mb_col < pc->mb_cols)
+        {
+            MB_MODE_INFO *const mbmi = & mi->mbmi;
+            MV *const mv = & mbmi->mv.as_mv;
+            MACROBLOCKD *xd = &pbi->mb;
+
+            // Invoke the bool decoder's fill hook before parsing this macroblock.
+            vp8dx_bool_decoder_fill(bc);
+
+            // Distance of Mb to the various image edges.
+            // These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
+            xd->mb_to_left_edge = -((mb_col * 16) << 3);
+            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+            xd->mb_to_top_edge = -((mb_row * 16) << 3);   // shift the non-negative product, then negate
+            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+            // If required read in new segmentation data for this MB
+            if (pbi->mb.update_mb_segmentation_map)
+                vp8_read_mb_features(bc, mbmi, &pbi->mb);
+
+            // Read the macroblock coeff skip flag if this feature is in use, else default to 0
+            if (pc->mb_no_coeff_skip)
+                mbmi->mb_skip_coeff = vp8_read(bc, prob_skip_false);
+            else
+                mbmi->mb_skip_coeff = 0;
+
+            mbmi->uv_mode = DC_PRED;
+
+            if ((mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, prob_intra)))    /* inter MB */
+            {
+                int rct[4];
+                vp8_prob mv_ref_p [VP8_MVREFS-1];
+                MV nearest, nearby, best_mv;
+
+                if (vp8_read(bc, prob_last))  /* not the default reference: add 1 or 2 depending on one more bit */
+                {
+                    mbmi->ref_frame = (MV_REFERENCE_FRAME)((int)mbmi->ref_frame + (int)(1 + vp8_read(bc, prob_gf)));
+                }
+
+                vp8_find_near_mvs(xd, mi, &nearest, &nearby, &best_mv, rct, mbmi->ref_frame, pbi->common.ref_frame_sign_bias);
+
+                vp8_mv_ref_probs(mv_ref_p, rct);
+
+                switch (mbmi->mode = read_mv_ref(bc, mv_ref_p))
+                {
+                case SPLITMV:
+                {
+                    const int s = mbmi->partitioning = vp8_treed_read(
+                                                           bc, vp8_mbsplit_tree, vp8_mbsplit_probs
+                                                       );
+                    const int num_p = vp8_mbsplit_count [s];
+                    const int *const  L = vp8_mbsplits [s];
+                    int j = 0;
+
+                    do  /* for each subset j */
+                    {
+                        B_MODE_INFO *const bmi = mbmi->partition_bmi + j;
+                        MV *const mv = & bmi->mv.as_mv;
+
+                        int k = -1;  /* first block in subset j */
+                        int mv_contz;
+
+                        while (j != L[++k])  /* NOTE(review): L[k] is read before the k >= 16 test below; relies on every j < num_p occurring in L */
+                            if (k >= 16)
+#if CONFIG_DEBUG
+                                assert(0);
+
+#else
+                                ;
+#endif
+
+                        mv_contz = vp8_mv_cont(&(vp8_left_bmi(mi, k)->mv.as_mv), &(vp8_above_bmi(mi, k, mis)->mv.as_mv));
+
+                        switch (bmi->mode = (B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) //pc->fc.sub_mv_ref_prob))
+                        {
+                        case NEW4X4:
+                            read_mv(bc, mv, (const MV_CONTEXT *) mvc);
+                            mv->row += best_mv.row;
+                            mv->col += best_mv.col;
+#ifdef VPX_MODE_COUNT
+                            vp8_mv_cont_count[mv_contz][3]++;
+#endif
+                            break;
+                        case LEFT4X4:
+                            *mv = vp8_left_bmi(mi, k)->mv.as_mv;
+#ifdef VPX_MODE_COUNT
+                            vp8_mv_cont_count[mv_contz][0]++;
+#endif
+                            break;
+                        case ABOVE4X4:
+                            *mv = vp8_above_bmi(mi, k, mis)->mv.as_mv;
+#ifdef VPX_MODE_COUNT
+                            vp8_mv_cont_count[mv_contz][1]++;
+#endif
+                            break;
+                        case ZERO4X4:
+                            *mv = Zero;
+#ifdef VPX_MODE_COUNT
+                            vp8_mv_cont_count[mv_contz][2]++;
+#endif
+                            break;
+                        default:
+                            break;
+                        }
+
+                        /* Fill (uniform) modes, mvs of jth subset.
+                           Must do it here because ensuing subsets can
+                           refer back to us via "left" or "above". */
+                        do
+                            if (j == L[k])
+                                mi->bmi[k] = *bmi;
+
+                        while (++k < 16);
+                    }
+                    while (++j < num_p);
+                }
+
+                *mv = mi->bmi[15].mv.as_mv;  /* the macroblock-level MV takes the value of block 15 */
+
+                break;  /* done with SPLITMV */
+
+                case NEARMV:
+                    *mv = nearby;
+
+                    // Clip "nearby" so that it does not extend too far out of the image
+                    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+                        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+                    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+                        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+                    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+                        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+                    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+                        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
+                    goto propagate_mv;
+
+                case NEARESTMV:
+                    *mv = nearest;
+
+                    // Clip "nearest" so that it does not extend too far out of the image
+                    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+                        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+                    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+                        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+                    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+                        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+                    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+                        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
+                    goto propagate_mv;
+
+                case ZEROMV:
+                    *mv = Zero;
+                    goto propagate_mv;
+
+                case NEWMV:
+                    read_mv(bc, mv, (const MV_CONTEXT *) mvc);
+                    mv->row += best_mv.row;
+                    mv->col += best_mv.col;
+                    /* Encoder should not produce invalid motion vectors, but since
+                     * arbitrary length MVs can be parsed from the bitstream, we
+                     * need to clamp them here in case we're reading bad data to
+                     * avoid a crash.
+                     */
+#if CONFIG_DEBUG
+                    assert(mv->col >= (xd->mb_to_left_edge - LEFT_TOP_MARGIN));
+                    assert(mv->col <= (xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN));
+                    assert(mv->row >= (xd->mb_to_top_edge - LEFT_TOP_MARGIN));
+                    assert(mv->row <= (xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN));
+#endif
+
+                    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+                        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+                    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+                        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+                    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+                        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+                    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+                        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
+                propagate_mv:  /* same MV throughout */
+                    {
+                        //int i=0;
+                        //do
+                        //{
+                        //  mi->bmi[i].mv.as_mv = *mv;
+                        //}
+                        //while( ++i < 16);
+
+                        mi->bmi[0].mv.as_mv = *mv;
+                        mi->bmi[1].mv.as_mv = *mv;
+                        mi->bmi[2].mv.as_mv = *mv;
+                        mi->bmi[3].mv.as_mv = *mv;
+                        mi->bmi[4].mv.as_mv = *mv;
+                        mi->bmi[5].mv.as_mv = *mv;
+                        mi->bmi[6].mv.as_mv = *mv;
+                        mi->bmi[7].mv.as_mv = *mv;
+                        mi->bmi[8].mv.as_mv = *mv;
+                        mi->bmi[9].mv.as_mv = *mv;
+                        mi->bmi[10].mv.as_mv = *mv;
+                        mi->bmi[11].mv.as_mv = *mv;
+                        mi->bmi[12].mv.as_mv = *mv;
+                        mi->bmi[13].mv.as_mv = *mv;
+                        mi->bmi[14].mv.as_mv = *mv;
+                        mi->bmi[15].mv.as_mv = *mv;
+                    }
+
+                    break;
+
+                default:;
+#if CONFIG_DEBUG
+                    assert(0);
+#endif
+                }
+
+            }
+            else
+            {
+                /* MB is intra coded */
+
+                int j = 0;
+
+                do
+                {
+                    mi->bmi[j].mv.as_mv = Zero;
+                }
+                while (++j < 16);
+
+                *mv = Zero;
+
+                if ((mbmi->mode = (MB_PREDICTION_MODE) vp8_read_ymode(bc, pc->fc.ymode_prob)) == B_PRED)
+                {
+                    j = 0;  /* one block mode is read for each of the 16 blocks */
+
+                    do
+                    {
+                        mi->bmi[j].mode = (B_PREDICTION_MODE)vp8_read_bmode(bc, pc->fc.bmode_prob);
+                    }
+                    while (++j < 16);
+                }
+
+                mbmi->uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, pc->fc.uv_mode_prob);
+            }
+
+            mi++;       // next macroblock
+        }
+
+        mi++;           // skip left predictor each row
+    }
+}
diff --git a/vp8/decoder/decodemv.h b/vp8/decoder/decodemv.h
new file mode 100644
index 0000000..4030071
--- /dev/null
+++ b/vp8/decoder/decodemv.h
@@ -0,0 +1,13 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+
+void vp8_decode_mode_mvs(VP8D_COMP *);  /* defined in decodemv.c */
diff --git a/vp8/decoder/decoderthreading.h b/vp8/decoder/decoderthreading.h
new file mode 100644
index 0000000..ebc5c27
--- /dev/null
+++ b/vp8/decoder/decoderthreading.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+
+
+
+#ifndef _DECODER_THREADING_H
+#define _DECODER_THREADING_H
+
+/* Declarations only; the definitions are not part of this header. */
+extern void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
+                                 MACROBLOCKD *xd);
+extern void vp8_stop_lfthread(VP8D_COMP *pbi);
+extern void vp8_start_lfthread(VP8D_COMP *pbi);
+extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
+#endif  /* _DECODER_THREADING_H */
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
new file mode 100644
index 0000000..4edf4f6
--- /dev/null
+++ b/vp8/decoder/decodframe.c
@@ -0,0 +1,907 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+#include "header.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "recon.h"
+#include "reconinter.h"
+#include "dequantize.h"
+#include "detokenize.h"
+#include "invtrans.h"
+#include "alloccommon.h"
+#include "entropymode.h"
+#include "quant_common.h"
+#include "segmentation_common.h"
+#include "setupintrarecon.h"
+#include "demode.h"
+#include "decodemv.h"
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "idct.h"
+#include "dequantize.h"
+#include "predictdc.h"
+#include "threading.h"
+#include "decoderthreading.h"
+#include "dboolhuff.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const cm = &pbi->common;
+    int q, pos;
+
+    // Fill the Y1, Y2 and UV dequantization tables for every quantizer index.
+    for (q = 0; q < QINDEX_RANGE; ++q)
+    {
+        // Entry [0][0] of each table takes the DC quantizer value.
+        cm->Y1dequant[q][0][0] = (short)vp8_dc_quant(q, cm->y1dc_delta_q);
+        cm->Y2dequant[q][0][0] = (short)vp8_dc2quant(q, cm->y2dc_delta_q);
+        cm->UVdequant[q][0][0] = (short)vp8_dc_uv_quant(q, cm->uvdc_delta_q);
+
+        // The entries reached through zig-zag positions 1..15 take the AC value.
+        for (pos = 1; pos < 16; ++pos)
+        {
+            const int rc = vp8_default_zig_zag1d[pos];
+            const int row = rc >> 2;
+            const int col = rc & 3;
+
+            cm->Y1dequant[q][row][col] = (short)vp8_ac_yquant(q);
+            cm->Y2dequant[q][row][col] = (short)vp8_ac2quant(q, cm->y2ac_delta_q);
+            cm->UVdequant[q][row][col] = (short)vp8_ac_uv_quant(q, cm->uvac_delta_q);
+        }
+    }
+}
+
+static void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int i;
+    int QIndex;
+    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+    VP8_COMMON *const pc = & pbi->common;
+
+    // Decide whether to use the default or alternate baseline Q value.
+    if (xd->segmentation_enabled)
+    {
+        // Abs Value
+        if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        // Delta Value
+        else
+            QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        // Clamp in both modes: QIndex is used as a table index below, so an out-of-range segment value must not reach it.
+        QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+    }
+    else
+        QIndex = pc->base_qindex;
+
+    // Set up the block level dequant pointers: blocks 0..15 use the Y1 table
+    for (i = 0; i < 16; i++)
+    {
+        xd->block[i].dequant = pc->Y1dequant[QIndex];
+    }
+
+    for (i = 16; i < 24; i++)  // blocks 16..23 use the UV table
+    {
+        xd->block[i].dequant = pc->UVdequant[QIndex];
+    }
+
+    xd->block[24].dequant = pc->Y2dequant[QIndex];  // block 24 uses the Y2 table
+
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)  /* needs a variable named 'pbi' in scope at the point of use */
+#else
+#define RTCD_VTABLE(x) NULL  /* expands to NULL when CONFIG_RUNTIME_CPU_DETECT is 0 */
+#endif
+
+// skip_recon_mb(): builds the prediction only. The "_s" predictor variants are used here; per the original
+// note they write the result directly to the dst buffer, which eliminates a predictor-to-dst copy.
+static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    const int inter_coded = xd->frame_type != KEY_FRAME  &&  xd->mbmi.ref_frame != INTRA_FRAME;
+
+    if (inter_coded)
+    {
+        vp8_build_inter_predictors_mb_s(xd);
+        return;
+    }
+
+    // Key frame or intra macroblock: the mbuv call comes first, then the mby call.
+    vp8_build_intra_predictors_mbuv_s(xd);
+    vp8_build_intra_predictors_mby_s_ptr(xd);
+}
+
+static void reconstruct_mb(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    const int intra_coded = xd->frame_type == KEY_FRAME  ||  xd->mbmi.ref_frame == INTRA_FRAME;
+
+    if (!intra_coded)  /* inter macroblock: inter prediction, then the 16x16 recon call */
+    {
+        vp8_build_inter_predictors_mb(xd);
+        vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
+        return;
+    }
+
+    vp8_build_intra_predictors_mbuv(xd);
+
+    if (xd->mbmi.mode == B_PRED)  /* B_PRED goes through the intra 4x4 recon call instead */
+    {
+        vp8_recon_intra4x4mb(RTCD_VTABLE(recon), xd);
+        return;
+    }
+
+    vp8_build_intra_predictors_mby_ptr(xd);
+    vp8_recon16x16mb(RTCD_VTABLE(recon), xd);
+}
+
+
+static void de_quantand_idct(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int i;
+    BLOCKD *b = &xd->block[24];
+    /* Dequantizes and inverse-transforms the blocks of xd into their diff buffers, clearing qcoeff entries as it goes. */
+
+    if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)  /* these modes go through block 24 first */
+    {
+        DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
+        // do 2nd order transform on the dc block
+        if (b->eob > 1)
+        {
+            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+            ((int *)b->qcoeff)[0] = 0;  /* clears qcoeff through eight int-sized stores */
+            ((int *)b->qcoeff)[1] = 0;
+            ((int *)b->qcoeff)[2] = 0;
+            ((int *)b->qcoeff)[3] = 0;
+            ((int *)b->qcoeff)[4] = 0;
+            ((int *)b->qcoeff)[5] = 0;
+            ((int *)b->qcoeff)[6] = 0;
+            ((int *)b->qcoeff)[7] = 0;
+        }
+        else
+        {
+            IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+            ((int *)b->qcoeff)[0] = 0;  /* only the first int-sized slot is cleared on this path */
+        }
+
+
+        for (i = 0; i < 16; i++)  /* blocks 0..15: the DC term is taken from xd->block[24].diff[i] */
+        {
+
+            b = &xd->block[i];
+
+            if (b->eob > 1)
+            {
+                DEQUANT_INVOKE(&pbi->dequant, idct_dc)(b->qcoeff, &b->dequant[0][0], b->diff, 32, xd->block[24].diff[i]);
+            }
+            else
+            {
+                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(xd->block[24].diff[i], b->diff, 32);
+            }
+        }
+
+        for (i = 16; i < 24; i++)  /* blocks 16..23: the last argument is 16 instead of 32 */
+        {
+            b = &xd->block[i];
+
+            if (b->eob > 1)
+            {
+                DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, 16);
+            }
+            else
+            {
+                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, 16);
+                ((int *)b->qcoeff)[0] = 0;
+            }
+        }
+    }
+    else  /* B_PRED or SPLITMV: block 24 is not used; each of the 24 blocks supplies its own DC term */
+    {
+        for (i = 0; i < 24; i++)
+        {
+
+            b = &xd->block[i];
+
+            if (b->eob > 1)
+            {
+                DEQUANT_INVOKE(&pbi->dequant, idct)(b->qcoeff, &b->dequant[0][0], b->diff, (32 - (i & 16)));  /* 32 for i < 16, 16 for i >= 16 */
+            }
+            else
+            {
+                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar)(b->qcoeff[0] * b->dequant[0][0], b->diff, (32 - (i & 16)));
+                ((int *)b->qcoeff)[0] = 0;
+            }
+        }
+    }
+}
+
+void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int total_eob = 0;
+
+    /* Read this macroblock's tokens unless it is flagged as skipped, in which case the token context is reset. */
+    if (!xd->mbmi.mb_skip_coeff)
+        total_eob = vp8_decode_mb_tokens(pbi, xd);
+    else
+        vp8_reset_mb_tokens_context(xd);
+
+    /* Default value; cleared again on the prediction-only path below. */
+    xd->mode_info_context->mbmi.dc_diff = 1;
+
+    /* Nothing decoded and neither B_PRED nor SPLITMV: prediction only. */
+    if (total_eob == 0 && xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
+    {
+        xd->mode_info_context->mbmi.dc_diff = 0;
+        skip_recon_mb(pbi, xd);
+        return;
+    }
+
+    /* The per-macroblock dequantizer setup only runs when segmentation is enabled. */
+    if (xd->segmentation_enabled)
+        mb_init_dequantizer(pbi, xd);
+
+    de_quantand_idct(pbi, xd);
+    reconstruct_mb(pbi, xd);
+}
+
+static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
+{
+    int delta = 0;
+
+    /* A flag bit; when set, a 4-bit magnitude and then a sign bit follow. */
+    if (vp8_read_bit(bc))
+    {
+        const int magnitude = vp8_read_literal(bc, 4);
+        const int negative = vp8_read_bit(bc);
+        delta = negative ? -magnitude : magnitude;
+    }
+
+    /* A value different from prev raises *q_update; the flag is never cleared here. */
+    if (delta != prev)
+        *q_update = 1;
+
+    return delta;
+}
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;  /* file handle defined only in PACKET_TESTING builds; no use of it is visible in this part of the file */
+#endif
+
+
+
+void vp8_decode_mb_row(VP8D_COMP *pbi,
+                       VP8_COMMON *pc,
+                       int mb_row,
+                       MACROBLOCKD *xd)
+{
+    /* Decodes macroblock row mb_row into pc->new_frame, then extends the row via vp8_extend_mb_row(). */
+    int i;
+    int recon_yoffset, recon_uvoffset;
+    int mb_col;
+    int recon_y_stride = pc->last_frame.y_stride;
+    int recon_uv_stride = pc->last_frame.uv_stride;
+
+    vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+    recon_yoffset = mb_row * recon_y_stride * 16;
+    recon_uvoffset = mb_row * recon_uv_stride * 8;
+    // reset above block coeffs
+
+    xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
+    xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
+    xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
+    xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
+    xd->up_available = (mb_row != 0);
+
+    xd->mb_to_top_edge = -((mb_row * 16) << 3);   // shift the non-negative product, then negate
+    xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+    for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+    {
+        // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
+        vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi, 32); //sizeof(MB_MODE_INFO) );  NOTE(review): hard-coded size; confirm against the struct
+
+        if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
+        {
+            for (i = 0; i < 16; i++)
+            {
+                BLOCKD *d = &xd->block[i];
+                vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
+            }
+        }
+
+        // Distance of Mb to the various image edges.
+        // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+        xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
+        xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
+        xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;
+
+        xd->left_available = (mb_col != 0);
+
+        // Select the appropriate reference frame for this MB
+        if (xd->mbmi.ref_frame == LAST_FRAME)
+        {
+            xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
+            xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
+            xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
+        }
+        else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
+        {
+            // Golden frame reconstruction buffer
+            xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
+            xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
+            xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
+        }
+        else
+        {
+            // Alternate reference frame reconstruction buffer (this branch is also taken for every other ref_frame value)
+            xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
+            xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
+            xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
+        }
+
+        vp8_build_uvmvs(xd, pc->full_pixel);
+
+        /*
+        if(pbi->common.current_video_frame==0 &&mb_col==1 && mb_row==0)
+        pbi->debugoutput =1;
+        else
+        pbi->debugoutput =0;
+        */
+        vp8dx_bool_decoder_fill(xd->current_bc);
+        vp8_decode_macroblock(pbi, xd);
+
+        // Step the frame offsets one macroblock to the right: 16 for Y, 8 for U/V
+        recon_yoffset += 16;
+        recon_uvoffset += 8;
+
+        ++xd->mode_info_context;  /* next mb */
+
+        xd->gf_active_ptr++;      // GF usage flag for next MB
+
+        xd->above_context[Y1CONTEXT] += 4;
+        xd->above_context[UCONTEXT ] += 2;
+        xd->above_context[VCONTEXT ] += 2;
+        xd->above_context[Y2CONTEXT] ++;
+
+        pbi->current_mb_col_main = mb_col;
+    }
+
+    // adjust to the next row of mbs
+    vp8_extend_mb_row(
+        &pc->new_frame,
+        xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
+    );
+
+    ++xd->mode_info_context;      /* skip prediction column */
+
+    pbi->last_mb_row_decoded = mb_row;
+}
+
+
+static unsigned int read_partition_size(const unsigned char *cx_size)
+{
+    /* 24-bit little-endian value: cx_size[0] is the least significant byte. */
+    unsigned int size = cx_size[2];
+    size = (size << 8) | cx_size[1];
+    return (size << 8) | cx_size[0];
+}
+
+
+static void setup_token_decoder(VP8D_COMP *pbi,
+                                const unsigned char *cx_data)
+{
+    int num_part;
+    int i;
+    VP8_COMMON          *pc = &pbi->common;
+    const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
+    vp8_reader          *bool_decoder;
+    const unsigned char *partition;
+    /* Starts one bool decoder per token partition found at cx_data. */
+    /* Parse number of token partitions to use */
+    pc->multi_token_partition = (TOKEN_PARTITION)vp8_read_literal(&pbi->bc, 2);
+    num_part = 1 << pc->multi_token_partition;
+
+    /* The (num_part - 1) 3-byte size entries must lie inside the packet;
+     * this also rejects cx_data pointing past the end of the data. */
+    if (user_data_end - cx_data < 3 * (num_part - 1))
+        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Truncated packet or corrupt partition sizes");
+
+    /* Set up pointers to the first partition */
+    partition = cx_data;
+    bool_decoder = &pbi->bc2;
+
+    if (num_part > 1)
+    {
+        CHECK_MEM_ERROR(pbi->mbc, vpx_malloc(num_part * sizeof(vp8_reader)));
+        bool_decoder = pbi->mbc;
+        partition += 3 * (num_part - 1);
+    }
+
+    for (i = 0; i < num_part; i++)
+    {
+        const unsigned char *partition_size_ptr = cx_data + i * 3;
+        const unsigned int   bytes_left = (unsigned int)(user_data_end - partition);
+        unsigned int         partition_size = bytes_left;  /* last one: implicit size */
+
+        /* Every partition but the last carries an explicit 3-byte size. */
+        if (i < num_part - 1)
+            partition_size = read_partition_size(partition_size_ptr);
+
+        /* Compare sizes rather than pointers: partition + size could wrap. */
+        if (partition_size > bytes_left)  /* NOTE(review): assumes vpx_internal_error() does not return */
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet or corrupt partition "
+                               "%d length", i + 1);
+
+        if (vp8dx_start_decode(bool_decoder, IF_RTCD(&pbi->dboolhuff),
+                               partition, partition_size))
+            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate bool decoder %d", i + 1);
+
+        /* Advance to the next partition */
+        partition += partition_size;
+        bool_decoder++;
+    }
+
+    /* Clamp number of decoder threads */
+    if (pbi->decoding_thread_count > num_part - 1)
+        pbi->decoding_thread_count = num_part - 1;
+}
+
+
+static void stop_token_decoder(VP8D_COMP *pbi)
+{  /* Stops the bool decoder(s) started for the token partitions and releases pbi->mbc when it was allocated. */
+    int i;
+    VP8_COMMON *pc = &pbi->common;
+
+    if (pc->multi_token_partition != ONE_PARTITION)
+    {
+        int num_part = (1 << pc->multi_token_partition);
+
+        for (i = 0; i < num_part; i++)
+        {
+            vp8dx_stop_decode(&pbi->mbc[i]);
+        }
+        vpx_free(pbi->mbc);
+        pbi->mbc = NULL;  /* do not leave a dangling pointer to the freed array */
+    }
+    else
+        vp8dx_stop_decode(& pbi->bc2);
+}
+
+static void init_frame(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    MACROBLOCKD *const xd  = & pbi->mb;
+
+    if (pc->frame_type != KEY_FRAME)
+    {
+        // Motion compensation filter: bilinear when pc->use_bilinear_mc_filter
+        // is set, six-tap otherwise.
+        pc->mcomp_filter_type = pc->use_bilinear_mc_filter ? BILINEAR : SIXTAP;
+
+        // Point the sub-pixel prediction hooks at the implementations that
+        // match the selected interpolation filter.
+        if (pc->mcomp_filter_type != SIXTAP)
+        {
+            xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear4x4);
+            xd->subpixel_predict8x4   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x4);
+            xd->subpixel_predict8x8   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x8);
+            xd->subpixel_predict16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear16x16);
+        }
+        else
+        {
+            xd->subpixel_predict      = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4);
+            xd->subpixel_predict8x4   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x4);
+            xd->subpixel_predict8x8   = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x8);
+            xd->subpixel_predict16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap16x16);
+        }
+    }
+    else
+    {
+        // Key frame: start from the default MV context and probability tables.
+        vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+
+        vp8_init_mbmode_probs(pc);
+
+        vp8_default_coef_probs(pc);
+        vp8_kf_default_bmode_probs(pc->kf_bmode_prob);
+
+        // Segment feature data goes back to all zeros, interpreted as deltas.
+        vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+        xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+
+        // Loop filter deltas (per reference frame and per mode) are zeroed too.
+        vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+        vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+        // Golden and alt-ref are refreshed directly, so no buffer copies are requested.
+        pc->refresh_golden_frame = 1;
+        pc->refresh_alt_ref_frame = 1;
+        pc->copy_buffer_to_gf = 0;
+        pc->copy_buffer_to_arf = 0;
+
+        // Golden and alt-ref modes cannot be used on a key frame, so the sign
+        // bias entries carry no meaning here; they are simply reset to 0.
+        pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+        pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+    }
+
+    xd->left_context = pc->left_context;
+    xd->mode_info_context = pc->mi;
+    xd->frame_type = pc->frame_type;
+    xd->mbmi.mode = DC_PRED;
+    xd->mode_info_stride = pc->mode_info_stride;
+}
+
+// Decodes one compressed frame held in pbi->Source (pbi->source_sz bytes).
+// Returns 0 on success, or -1 if the data is too short for its header or
+// the frame dimensions are zero.
+int vp8_decode_frame(VP8D_COMP *pbi)
+{
+    vp8_reader *const bc = & pbi->bc;
+    VP8_COMMON *const pc = & pbi->common;
+    MACROBLOCKD *const xd  = & pbi->mb;
+    const unsigned char *data = (const unsigned char *)pbi->Source;
+    const unsigned char *const data_end = data + pbi->source_sz;
+    int first_partition_length_in_bytes;
+    int mb_row;
+    int i, j, k, l;
+    const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+
+    // The 3-byte frame tag must be present before any of it is read.
+    if (data_end - data < 3)
+    {
+        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Truncated packet");
+        return -1;  // only reached if the error handler returns
+    }
+
+    // Frame tag: bit 0 type, bits 1-3 version, bit 4 show, bits 5-23 size.
+    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
+    pc->version = (data[0] >> 1) & 7;
+    pc->show_frame = (data[0] >> 4) & 1;
+    first_partition_length_in_bytes =
+        (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+    data += 3;
+
+    // Compare lengths rather than forming a pointer past the buffer end.
+    if (first_partition_length_in_bytes > data_end - data)
+        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Truncated packet or corrupt partition 0 length");
+    vp8_setup_version(pc);
+
+    if (pc->frame_type == KEY_FRAME)
+    {
+        const int Width = pc->Width;
+        const int Height = pc->Height;
+
+        // A key frame adds 7 bytes: 3 of sync code, 4 of size and scaling.
+        if (data_end - data < 7)
+        {
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated key frame header");
+            return -1;  // only reached if the error handler returns
+        }
+
+        // vet via sync code
+        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+            vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+                               "Invalid frame sync code");
+
+        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
+        pc->horiz_scale = data[4] >> 6;
+        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
+        pc->vert_scale = data[6] >> 6;
+        data += 7;
+
+        if (Width != pc->Width  ||  Height != pc->Height)
+        {
+            if (pc->Width <= 0)
+            {
+                pc->Width = Width;
+                vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                                   "Invalid frame width");
+            }
+
+            if (pc->Height <= 0)
+            {
+                pc->Height = Height;
+                vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                                   "Invalid frame height");
+            }
+
+            if (vp8_alloc_frame_buffers(&pbi->common, pc->Width, pc->Height))
+                vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                                   "Failed to allocate frame buffers");
+        }
+    }
+
+    if (pc->Width == 0 || pc->Height == 0)
+    {
+        return -1;
+    }
+
+    init_frame(pbi);
+
+    if (vp8dx_start_decode(bc, IF_RTCD(&pbi->dboolhuff),
+                           data, data_end - data))
+        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate bool decoder 0");
+    if (pc->frame_type == KEY_FRAME) {
+        pc->clr_type    = (YUV_TYPE)vp8_read_bit(bc);
+        pc->clamp_type  = (CLAMP_TYPE)vp8_read_bit(bc);
+    }
+
+    // Is segmentation enabled
+    xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
+
+    if (xd->segmentation_enabled)
+    {
+        // Signal whether or not the segmentation map is being explicitly updated this frame.
+        xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
+        xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
+
+        if (xd->update_mb_segmentation_data)
+        {
+            xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+
+            vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+
+            // For each segmentation feature (Quant and loop filter level)
+            for (i = 0; i < MB_LVL_MAX; i++)
+            {
+                for (j = 0; j < MAX_MB_SEGMENTS; j++)
+                {
+                    // Frame level data
+                    if (vp8_read_bit(bc))
+                    {
+                        xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
+
+                        if (vp8_read_bit(bc))
+                            xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j];
+                    }
+                    else
+                        xd->segment_feature_data[i][j] = 0;
+                }
+            }
+        }
+
+        if (xd->update_mb_segmentation_map)
+        {
+            // Which macro block level features are enabled
+            vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+            // Read the probs used to decode the segment id for each macro block.
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+            {
+                // If not explicitly set value is defaulted to 255 by memset above
+                if (vp8_read_bit(bc))
+                    xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
+            }
+        }
+    }
+
+    // Read the loop filter level and type
+    pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
+    pc->filter_level = vp8_read_literal(bc, 6);
+    pc->sharpness_level = vp8_read_literal(bc, 3);
+
+    // Read in loop filter deltas applied at the MB level based on mode or ref frame.
+    xd->mode_ref_lf_delta_update = 0;
+    xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
+
+    if (xd->mode_ref_lf_delta_enabled)
+    {
+        // Do the deltas need to be updated
+        xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
+
+        if (xd->mode_ref_lf_delta_update)
+        {
+            // Send update
+            for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+            {
+                if (vp8_read_bit(bc))
+                {
+                    //sign = vp8_read_bit( bc );
+                    xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+                    if (vp8_read_bit(bc))        // Apply sign
+                        xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+                }
+            }
+
+            // Send update
+            for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+            {
+                if (vp8_read_bit(bc))
+                {
+                    //sign = vp8_read_bit( bc );
+                    xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+                    if (vp8_read_bit(bc))        // Apply sign
+                        xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+                }
+            }
+        }
+    }
+
+    setup_token_decoder(pbi, data + first_partition_length_in_bytes);
+    xd->current_bc = &pbi->bc2;
+
+    // Read the default quantizers.
+    {
+        int Q, q_update;
+
+        Q = vp8_read_literal(bc, 7);  // AC 1st order Q = default
+        pc->base_qindex = Q;
+        q_update = 0;
+        pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
+        pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update);
+        pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update);
+        pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update);
+        pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update);
+
+        if (q_update)
+            vp8cx_init_de_quantizer(pbi);
+
+        // MB level dequantizer setup
+        mb_init_dequantizer(pbi, &pbi->mb);
+    }
+
+    // Determine if the golden frame or ARF buffer should be updated and how.
+    // For all non key frames the GF and ARF refresh flags and sign bias
+    // flags must be set explicitly.
+    if (pc->frame_type != KEY_FRAME)
+    {
+        // Should the GF or ARF be updated from the current frame
+        pc->refresh_golden_frame = vp8_read_bit(bc);
+        pc->refresh_alt_ref_frame = vp8_read_bit(bc);
+
+        // Buffer to buffer copy flags.
+        pc->copy_buffer_to_gf = 0;
+
+        if (!pc->refresh_golden_frame)
+            pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+
+        pc->copy_buffer_to_arf = 0;
+
+        if (!pc->refresh_alt_ref_frame)
+            pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+
+        pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
+        pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
+    }
+
+    pc->refresh_entropy_probs = vp8_read_bit(bc);
+    if (pc->refresh_entropy_probs == 0)
+    {
+        vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    }
+
+    pc->refresh_last_frame = pc->frame_type == KEY_FRAME  ||  vp8_read_bit(bc);
+
+    vp8dx_bool_decoder_fill(bc);
+    {
+        // read coef probability tree
+
+        for (i = 0; i < BLOCK_TYPES; i++)
+            for (j = 0; j < COEF_BANDS; j++)
+                for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+                    for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++)
+                    {
+                        vp8_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+
+                        if (vp8_read(bc, vp8_coef_update_probs [i][j][k][l]))
+                        {
+                            *p = (vp8_prob)vp8_read_literal(bc, 8);
+                        }
+                    }
+    }
+
+    vpx_memcpy(&xd->pre, &pc->last_frame, sizeof(YV12_BUFFER_CONFIG));
+    vpx_memcpy(&xd->dst, &pc->new_frame, sizeof(YV12_BUFFER_CONFIG));
+
+    // set up frame new frame for intra coded blocks
+    vp8_setup_intra_recon(&pc->new_frame);
+
+    vp8_setup_block_dptrs(xd);
+
+    vp8_build_block_doffsets(xd);
+
+    // clear out the coeff buffer
+    vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+    // Read the mb_no_coeff_skip flag
+    pc->mb_no_coeff_skip = (int)vp8_read_bit(bc);
+
+    if (pc->frame_type == KEY_FRAME)
+        vp8_kfread_modes(pbi);
+    else
+        vp8_decode_mode_mvs(pbi);
+
+    // reset since these guys are used as iterators
+    vpx_memset(pc->above_context[Y1CONTEXT], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 4);
+    vpx_memset(pc->above_context[UCONTEXT ], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 2);
+    vpx_memset(pc->above_context[VCONTEXT ], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols * 2);
+    vpx_memset(pc->above_context[Y2CONTEXT], 0, sizeof(ENTROPY_CONTEXT) * pc->mb_cols);
+
+    xd->gf_active_ptr = (signed char *)pc->gf_active_flags;     // Point to base of GF active flags data structure
+
+    vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
+
+    if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
+        vp8_start_lfthread(pbi);
+
+    if (pbi->b_multithreaded_rd && pbi->common.multi_token_partition != ONE_PARTITION)
+    {
+        vp8_mtdecode_mb_rows(pbi, xd);
+    }
+    else
+    {
+        int ibc = 0;
+        int num_part = 1 << pbi->common.multi_token_partition;
+
+        // Decode the individual macro block
+        for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+        {
+            if (num_part > 1)
+            {
+                xd->current_bc = & pbi->mbc[ibc];
+                ibc++;
+
+                if (ibc == num_part)
+                    ibc = 0;
+            }
+
+            vp8_decode_mb_row(pbi, pc, mb_row, xd);
+        }
+
+        pbi->last_mb_row_decoded = mb_row;
+    }
+
+    stop_token_decoder(pbi);
+
+    vp8dx_stop_decode(bc);
+
+    // vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos);
+
+    // If this was a kf or Gf note the Q used
+    if ((pc->frame_type == KEY_FRAME) || (pc->refresh_golden_frame) || pbi->common.refresh_alt_ref_frame)
+        pc->last_kf_gf_q = pc->base_qindex;
+
+    if (pc->refresh_entropy_probs == 0)
+    {
+        vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+    }
+
+#ifdef PACKET_TESTING
+    {
+        FILE *f = fopen("decompressor.VP8", "ab");
+        unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8;
+        fwrite((void *) &size, 4, 1, f);
+        fwrite((void *) pbi->Source, size, 1, f);
+        fclose(f);
+    }
+#endif
+
+    return 0;
+}
diff --git a/vp8/decoder/demode.c b/vp8/decoder/demode.c
new file mode 100644
index 0000000..fd05e6d
--- /dev/null
+++ b/vp8/decoder/demode.c
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+#include "entropymode.h"
+#include "findnearmv.h"
+
+
+// Decodes one value from vp8_bmode_tree using the node probabilities in p.
+int vp8_read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+    // The tree reader's result is handed back unchanged.
+    return vp8_treed_read(bc, vp8_bmode_tree, p);
+}
+
+
+// Decodes one value from vp8_ymode_tree using the node probabilities in p.
+int vp8_read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    // The tree reader's result is handed back unchanged.
+    return vp8_treed_read(bc, vp8_ymode_tree, p);
+}
+
+// Decodes one value from vp8_kf_ymode_tree using the node probabilities in p.
+int vp8_kfread_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    // The tree reader's result is handed back unchanged.
+    return vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+}
+
+
+
+// Decodes one value from vp8_uv_mode_tree using the node probabilities in p.
+int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+    // The tree reader's result is handed back unchanged.
+    return vp8_treed_read(bc, vp8_uv_mode_tree, p);
+}
+
+// Reads one macroblock's segment id when the segmentation map is updated.
+void vp8_read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
+{
+    // Nothing is coded unless segmentation is on and its map is being sent.
+    if (!x->segmentation_enabled || !x->update_mb_segmentation_map)
+        return;
+
+    // Two-level tree: prob[0] picks the pair, prob[1] or prob[2] the member.
+    mi->segment_id = vp8_read(r, x->mb_segment_tree_probs[0])
+                     ? (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]))
+                     : (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1]));
+}
+
+// Maps a whole-macroblock luma mode to the sub-block mode stored in each of
+// its 16 sub-blocks.  V_PRED, H_PRED and TM_PRED have direct counterparts;
+// DC_PRED and any unlisted mode give B_DC_PRED.
+static B_PREDICTION_MODE kf_fill_bmode(MB_PREDICTION_MODE y_mode)
+{
+    if (y_mode == V_PRED)
+        return B_VE_PRED;
+
+    if (y_mode == H_PRED)
+        return B_HE_PRED;
+
+    if (y_mode == TM_PRED)
+        return B_TM_PRED;
+
+    return B_DC_PRED;
+}
+
+// Reads the prediction modes of every macroblock of a key frame from
+// pbi->bc into the mode info array at pbi->common.mi.  Each macroblock gets
+// its segment id, skip flag, luma mode, 16 sub-block modes and uv mode.
+void vp8_kfread_modes(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const cp = & pbi->common;
+    vp8_reader *const bc = & pbi->bc;
+    const int ms = cp->mode_info_stride;
+    MODE_INFO *m = cp->mi;
+    vp8_prob prob_skip_false = 0;
+    int mb_row, mb_col, i;
+
+    // The skip probability is coded only when per-macroblock skip flags are.
+    if (cp->mb_no_coeff_skip)
+        prob_skip_false = (vp8_prob)(vp8_read_literal(bc, 8));
+
+    // Row by row, column by column; m always points at the current entry.
+    for (mb_row = 0; mb_row < cp->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cp->mb_cols; mb_col++)
+        {
+            MB_PREDICTION_MODE y_mode;
+
+            vp8dx_bool_decoder_fill(bc);
+
+            // Segment 0 by default; overwritten only when the segmentation
+            // map is being updated on this frame.
+            m->mbmi.segment_id = 0;
+
+            if (pbi->mb.update_mb_segmentation_map)
+                vp8_read_mb_features(bc, &m->mbmi, &pbi->mb);
+
+            // Coefficient skip flag: read if the feature is in use, else 0.
+            if (!cp->mb_no_coeff_skip)
+                m->mbmi.mb_skip_coeff = 0;
+            else
+                m->mbmi.mb_skip_coeff = vp8_read(bc, prob_skip_false);
+
+            y_mode = (MB_PREDICTION_MODE) vp8_kfread_ymode(bc, cp->kf_ymode_prob);
+
+            m->mbmi.ref_frame = INTRA_FRAME;
+            m->mbmi.mode = y_mode;
+
+            if (m->mbmi.mode != B_PRED)
+            {
+                // One mode for the whole macroblock: record its sub-block
+                // equivalent in all 16 entries.
+                const B_PREDICTION_MODE fill = kf_fill_bmode(y_mode);
+
+                for (i = 0; i < 16; i++)
+                    m->bmi[i].mode = fill;
+            }
+            else
+            {
+                // Per sub-block modes; the probabilities are indexed by the
+                // modes that vp8_above_bmi() and vp8_left_bmi() return.
+                for (i = 0; i < 16; i++)
+                {
+                    const B_PREDICTION_MODE A = vp8_above_bmi(m, i, ms)->mode;
+                    const B_PREDICTION_MODE L = vp8_left_bmi(m, i)->mode;
+
+                    m->bmi[i].mode = (B_PREDICTION_MODE) vp8_read_bmode(bc, cp->kf_bmode_prob [A] [L]);
+                }
+            }
+
+            // The uv mode is read last; then move on to the next entry.
+            m->mbmi.uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(bc, cp->kf_uv_mode_prob);
+            m++;
+        }
+
+        m++; // skip the border
+    }
+}
diff --git a/vp8/decoder/demode.h b/vp8/decoder/demode.h
new file mode 100644
index 0000000..51bbc5e
--- /dev/null
+++ b/vp8/decoder/demode.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DEMODE_H
+#define DEMODE_H
+
+#include "onyxd_int.h"
+
+/* Read (intra) modes for all blocks in a keyframe */
+void vp8_kfread_modes(VP8D_COMP *pbi);
+
+/* Intra mode for a Y subblock */
+int vp8_read_bmode(vp8_reader *, const vp8_prob *);
+
+/* MB intra Y mode trees differ for key and inter frames. */
+int   vp8_read_ymode(vp8_reader *, const vp8_prob *);
+int vp8_kfread_ymode(vp8_reader *, const vp8_prob *);
+
+/* MB intra UV mode trees are the same for key and inter frames. */
+int vp8_read_uv_mode(vp8_reader *, const vp8_prob *);
+
+/* Read any macroblock-level features that may be present. */
+void vp8_read_mb_features(vp8_reader *, MB_MODE_INFO *, MACROBLOCKD *);
+
+#endif /* DEMODE_H */
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
new file mode 100644
index 0000000..14798d9
--- /dev/null
+++ b/vp8/decoder/dequantize.c
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "predictdc.h"
+#include "idct.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ;
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+
+
+// Dequantizes one block: each of the 16 entries of d->qcoeff is multiplied
+// by the matching factor (read flat from d->dequant[0][0]) into d->dqcoeff.
+void vp8_dequantize_b_c(BLOCKD *d)
+{
+    const short *q      = d->qcoeff;
+    const short *factor = &d->dequant[0][0];
+    short *out          = d->dqcoeff;
+    short *const end    = out + 16;
+
+    while (out < end)
+        *out++ = *q++ * *factor++;
+}
+
+// Scale input by dq in place, transform into output, clear 32 bytes of input.
+void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch)
+{
+    const short *scale = dq;
+    short *coef;
+
+    for (coef = input; coef < input + 16; coef++, scale++)
+        *coef = *scale * *coef;
+
+    vp8_short_idct4x4llm_c(input, output, pitch);
+    vpx_memset(input, 0, 32);
+}
+
+// Like vp8_dequant_idct_c, but coefficient 0 is set to Dc rather than scaled.
+void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc)
+{
+    const short *scale = dq + 1;
+    short *coef;
+
+    input[0] = (short)Dc;
+
+    for (coef = input + 1; coef < input + 16; coef++, scale++)
+        *coef = *scale * *coef;
+
+    vp8_short_idct4x4llm_c(input, output, pitch);
+    vpx_memset(input, 0, 32);
+}
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
new file mode 100644
index 0000000..d16b02e
--- /dev/null
+++ b/vp8/decoder/dequantize.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_H
+#define DEQUANTIZE_H
+#include "blockd.h"
+
+#define prototype_dequant_block(sym) /* signature of a whole-BLOCKD dequantizer */ \
+    void sym(BLOCKD *x)
+/* Signature of a routine that dequantizes input by dq and transforms it into output. */
+#define prototype_dequant_idct(sym) \
+    void sym(short *input, short *dq, short *output, int pitch)
+/* Same arguments plus dc, a DC value supplied separately by the caller. */
+#define prototype_dequant_idct_dc(sym) \
+    void sym(short *input, short *dq, short *output, int pitch, int dc)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/dequantize_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/dequantize_arm.h"
+#endif
+
+#ifndef vp8_dequant_block   /* bind to the C version unless already defined */
+#define vp8_dequant_block vp8_dequantize_b_c
+#endif
+extern prototype_dequant_block(vp8_dequant_block);
+/* NOTE(review): overrides presumably come from the arch headers included above. */
+#ifndef vp8_dequant_idct    /* bind to the C version unless already defined */
+#define vp8_dequant_idct vp8_dequant_idct_c
+#endif
+extern prototype_dequant_idct(vp8_dequant_idct);
+
+#ifndef vp8_dequant_idct_dc /* bind to the C version unless already defined */
+#define vp8_dequant_idct_dc vp8_dequant_dc_idct_c
+#endif
+extern prototype_dequant_idct_dc(vp8_dequant_idct_dc);
+
+
+typedef prototype_dequant_block((*vp8_dequant_block_fn_t));     /* function pointer types, */
+typedef prototype_dequant_idct((*vp8_dequant_idct_fn_t));       /* one per prototype macro */
+typedef prototype_dequant_idct_dc((*vp8_dequant_idct_dc_fn_t));
+typedef struct   /* table of dequantize routines, filled in when run-time CPU detection is on */
+{
+    vp8_dequant_block_fn_t    block;    /* whole-block dequantizer */
+    vp8_dequant_idct_fn_t     idct;     /* dequantize + inverse transform */
+    vp8_dequant_idct_dc_fn_t  idct_dc;  /* same, with a separately supplied DC */
+} vp8_dequant_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT   /* dispatch through the function table in ctx */
+#define DEQUANT_INVOKE(ctx,fn) (ctx)->fn
+#else                           /* bind at compile time to vp8_dequant_<fn>; ctx is unused */
+#define DEQUANT_INVOKE(ctx,fn) vp8_dequant_##fn
+#endif
+
+#endif
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
new file mode 100644
index 0000000..a42f18d
--- /dev/null
+++ b/vp8/decoder/detokenize.c
@@ -0,0 +1,374 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "type_aliases.h"
+#include "blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define BR_COUNT 8
+#define BOOL_DATA UINT8
+
+#define OCB_X (PREV_COEF_CONTEXTS * ENTROPY_NODES) /* number of probabilities in one coefficient band */
+DECLARE_ALIGNED(16, UINT16, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X}; /* coefficient position -> offset of its band within one block type's probabilities */
+#define EOB_CONTEXT_NODE            0
+#define ZERO_CONTEXT_NODE           1
+#define ONE_CONTEXT_NODE            2
+#define LOW_VAL_CONTEXT_NODE        3
+#define TWO_CONTEXT_NODE            4
+#define THREE_CONTEXT_NODE          5
+#define HIGH_LOW_CONTEXT_NODE       6
+#define CAT_ONE_CONTEXT_NODE        7
+#define CAT_THREEFOUR_CONTEXT_NODE  8
+#define CAT_THREE_CONTEXT_NODE      9
+#define CAT_FIVE_CONTEXT_NODE       10
+
+/*
+//the definition is put in "onyxd_int.h"
+typedef struct
+{
+    INT16         min_val;
+    INT16         Length;
+    UINT8 Probs[12];
+} TOKENEXTRABITS;
+*/
+
+DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) = // one entry per token
+{   // each entry: { min_val, Length, Probs[12] }; the category 6 decoder reads bits Length..0
+    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //ZERO_TOKEN
+    {  1, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //ONE_TOKEN
+    {  2, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //TWO_TOKEN
+    {  3, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //THREE_TOKEN
+    {  4, 0, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },   //FOUR_TOKEN
+    {  5, 0, { 159, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  //DCT_VAL_CATEGORY1
+    {  7, 1, { 145, 165, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY2
+    { 11, 2, { 140, 148, 173, 0,  0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY3
+    { 19, 3, { 135, 140, 155, 176, 0,  0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY4
+    { 35, 4, { 130, 134, 141, 157, 180, 0,  0,  0,  0,  0,  0,  0   } }, //DCT_VAL_CATEGORY5
+    { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0   } }, //DCT_VAL_CATEGORY6
+    {  0, -1, { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0   } },  // EOB TOKEN
+};
+
+
+// Zeroes the above and left entropy context entries of blocks 0-23, and
+// those of block 24 (reached through Y2CONTEXT) as well unless the
+// macroblock mode is B_PRED or SPLITMV.
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT **const above = x->above_context;
+    ENTROPY_CONTEXT(* const left)[4] = x->left_context;
+    int b = 0;
+
+    while (b < 24)
+    {
+        // Each block maps to a context array plus an offset within it.
+        const int ctx = vp8_block2context[b];
+
+        left[ctx][vp8_block2left[b]] = 0;
+        above[ctx][vp8_block2above[b]] = 0;
+        b++;
+    }
+
+    // B_PRED and SPLITMV leave the Y2 context entries as they are.
+    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+        return;
+
+    // Block 24 is addressed through Y2CONTEXT.
+    left[Y2CONTEXT][vp8_block2left[24]] = 0;
+    above[Y2CONTEXT][vp8_block2above[24]] = 0;
+}
+DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]);
+#define NORMALIZE /* rescale range and value; uses the caller's shift, range, value, count, bufptr */ \
+    /*if(range < 0x80)*/                            \
+    { \
+        shift = vp8dx_bitreader_norm[range]; /* shift amount comes from the lookup table */ \
+        range <<= shift; \
+        value <<= shift; \
+        count -= shift; \
+        if(count <= 0) /* buffered bits used up: merge in the next byte */ \
+        { \
+            count += BR_COUNT ; \
+            value |= (*bufptr) << (BR_COUNT-count); \
+            bufptr = br_ptr_advance(bufptr, 1); \
+        } \
+    }
+
+#define DECODE_AND_APPLYSIGN(value_to_sign) /* one bit, split at half of range, sets v = +/-value_to_sign */ \
+    split = (range + 1) >> 1; \
+    if ( (value >> 8) < split ) \
+    { \
+        range = split; \
+        v= value_to_sign; \
+    } \
+    else \
+    { \
+        range = range-split; \
+        value = value-(split<<8); \
+        v = -value_to_sign; \
+    } \
+    range +=range;                   \
+    value +=value;                   \
+    if (!--count) /* buffered bits used up: merge in the next byte */ \
+    { \
+        count = BR_COUNT; \
+        value |= *bufptr; \
+        bufptr = br_ptr_advance(bufptr, 1); \
+    }
+
+#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) /* decode one bit; jump to branch when value falls below split, else fall through */ \
+    { \
+        split = 1 +  ((( probability*(range-1) ) )>> 8); \
+        if ( (value >> 8) < split ) \
+        { \
+            range = split; \
+            NORMALIZE \
+            goto branch; \
+        } \
+        value -= (split<<8); \
+        range = range - split; \
+        NORMALIZE \
+    }
+
+#define DECODE_AND_LOOP_IF_ZERO(probability,branch) /* like DECODE_AND_BRANCH_IF_ZERO, but the taken path first steps to the next coefficient */ \
+    { \
+        split = 1 + ((( probability*(range-1) ) ) >> 8); \
+        if ( (value >> 8) < split ) \
+        { \
+            range = split; \
+            NORMALIZE \
+            Prob = coef_probs; /* back to the first context row */ \
+            if(c<15) {\
+            ++c; \
+            Prob += vp8_coef_bands_x[c]; /* band offset of the new position */ \
+            goto branch; \
+            } goto BLOCK_FINISHED; /*for malformed input */\
+        } \
+        value -= (split<<8); \
+        range = range - split; \
+        NORMALIZE \
+    }
+
+#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) /* read the sign, store the coefficient, then continue or finish the block */ \
+    DECODE_AND_APPLYSIGN(val) \
+    Prob = coef_probs + (ENTROPY_NODES*2); /* third context row for the next token */ \
+    if(c < 15){\
+        qcoeff_ptr [ scan[c] ] = (INT16) v; \
+        ++c; \
+        goto DO_WHILE; }\
+    qcoeff_ptr [ scan[15] ] = (INT16) v; /* position 15 is the last one: block is done */ \
+    goto BLOCK_FINISHED;
+
+
+#define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count) /* decode extra bit bits_count of token t; when set, add 1<<bits_count to val */ \
+    split = 1 +  (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \
+    if(value >= (split<<8))\
+    {\
+        range = range-split;\
+        value = value-(split<<8);\
+        val += ((UINT16)1<<bits_count);\
+    }\
+    else\
+    {\
+        range = split;\
+    }\
+    NORMALIZE
+
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) // decodes the coefficient tokens of one macroblock from x->current_bc; returns eobtotal
+{
+    ENTROPY_CONTEXT **const A = x->above_context;
+    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    const VP8_COMMON *const oc = & dx->common;
+
+    BOOL_DECODER *bc = x->current_bc;
+
+    ENTROPY_CONTEXT *a;
+    ENTROPY_CONTEXT *l;
+    int i;
+
+    int eobtotal = 0;
+    // count/bufptr/range/value/shift/split are the working state of the decode macros above.
+    register int count;
+
+    const BOOL_DATA *bufptr;
+    register unsigned int range;
+    register unsigned int value;
+    const int *scan;
+    register unsigned int shift;
+    UINT32 split;
+    INT16 *qcoeff_ptr;
+
+    const vp8_prob *coef_probs;
+    int type;
+    int stop;
+    INT16 val, bits_count;
+    INT16 c;
+    INT16 t;
+    INT16 v;
+    const vp8_prob *Prob;
+
+    //int *scan;
+    type = 3;
+    i = 0;
+    stop = 16;
+    // Unless the mode is B_PRED or SPLITMV, block 24 is decoded first with type 1.
+    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    {
+        i = 24;
+        stop = 24;
+        type = 1;
+        qcoeff_ptr = &x->qcoeff[24*16];
+        scan = vp8_default_zig_zag1d;
+        eobtotal -= 16; // NOTE(review): presumably offsets the 16 type-0 blocks, whose eob starts at 1 - confirm
+    }
+    else
+    {
+        scan = vp8_default_zig_zag1d;
+        qcoeff_ptr = &x->qcoeff[0];
+    }
+    // Work on local copies of the bool decoder state; written back before returning.
+    count   = bc->count;
+    range   = bc->range;
+    value   = bc->value;
+    bufptr  = bc->read_ptr;
+
+
+    coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+
+BLOCK_LOOP:
+    a = A[ vp8_block2context[i] ] + vp8_block2above[i];
+    l = L[ vp8_block2context[i] ] + vp8_block2left[i];
+    c = (INT16)(!type); // type 0 blocks start at position 1, every other type at 0
+
+    VP8_COMBINEENTROPYCONTEXTS(t, *a, *l);
+    Prob = coef_probs;
+    Prob += t * ENTROPY_NODES; // t selects the context row used for the first token
+
+DO_WHILE:
+    Prob += vp8_coef_bands_x[c];
+    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
+
+CHECK_0_:
+    DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_);
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val;
+    bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length;
+    // Category 6: extra bits are read from index Length down to 0.
+    do
+    {
+        DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count);
+        bits_count -- ;
+    }
+    while (bits_count >= 0);
+
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_FIVE_CONTEXT_NODE_0_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_THREEFOUR_CONTEXT_NODE_0_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_);
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_THREE_CONTEXT_NODE_0_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+HIGH_LOW_CONTEXT_NODE_0_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_);
+
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1);
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+CAT_ONE_CONTEXT_NODE_0_:
+    val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val;
+    DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
+
+LOW_VAL_CONTEXT_NODE_0_:
+    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
+    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
+
+THREE_CONTEXT_NODE_0_:
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
+
+TWO_CONTEXT_NODE_0_:
+    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
+
+ONE_CONTEXT_NODE_0_:
+    DECODE_AND_APPLYSIGN(1);
+    Prob = coef_probs + ENTROPY_NODES; // a coefficient of magnitude 1 selects the second context row
+
+    if (c < 15)
+    {
+        qcoeff_ptr [ scan[c] ] = (INT16) v;
+        ++c;
+        goto DO_WHILE;
+    }
+
+    qcoeff_ptr [ scan[15] ] = (INT16) v; // NOTE(review): c is still 15 here, so eob is recorded as 15 - confirm intended
+BLOCK_FINISHED:
+    t = ((x->block[i].eob = c) != !type);   // any nonzero data?
+    eobtotal += x->block[i].eob;
+    *a = *l = t;
+    qcoeff_ptr += 16;
+
+    i++;
+
+    if (i < stop)
+        goto BLOCK_LOOP;
+    // Block 24 just finished: go back and decode blocks 0-15 with type 0.
+    if (i == 25)
+    {
+        scan = vp8_default_zig_zag1d;//x->scan_order1d;
+        type = 0;
+        i = 0;
+        stop = 16;
+        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+        qcoeff_ptr = &x->qcoeff[0];
+        goto BLOCK_LOOP;
+    }
+    // Blocks 0-15 finished: continue with blocks 16-23 using type 2.
+    if (i == 16)
+    {
+        type = 2;
+        coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+        stop = 24;
+        goto BLOCK_LOOP;
+    }
+    // All blocks done: store the bool decoder state back into bc.
+    bc->count = count;
+    bc->value = value;
+    bc->range = range;
+    bc->read_ptr = bufptr;
+    return eobtotal;
+
+}
diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h
new file mode 100644
index 0000000..6a9a476
--- /dev/null
+++ b/vp8/decoder/detokenize.h
@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef detokenize_h
+#define detokenize_h 1
+
+#include "onyxd_int.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x);      /* defined outside this header */
+int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);  /* NOTE(review): result is presumably the macroblock's eob total - confirm */
+
+#endif /* detokenize_h */
diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
new file mode 100644
index 0000000..302b64b
--- /dev/null
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "onyxd_int.h"
+
+extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
+
+void vp8_dmachine_specific_config(VP8D_COMP *pbi)
+{
+    // Pure C: with runtime CPU detection enabled, install the portable C routines in pbi's tables.
+#if CONFIG_RUNTIME_CPU_DETECT
+    pbi->mb.rtcd         = &pbi->common.rtcd;
+    pbi->dequant.block   = vp8_dequantize_b_c;
+    pbi->dequant.idct    = vp8_dequant_idct_c;
+    pbi->dequant.idct_dc = vp8_dequant_dc_idct_c;
+    pbi->dboolhuff.start = vp8dx_start_decode_c;
+    pbi->dboolhuff.stop  = vp8dx_stop_decode_c;
+    pbi->dboolhuff.fill  = vp8dx_bool_decoder_fill_c;
+#if 0 //For use with RTCD, when implemented
+    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
+    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
+#endif
+#endif
+    // x86 builds then run their arch-specific init on the same pbi (presumably to override entries - confirm).
+#if ARCH_X86 || ARCH_X86_64
+    vp8_arch_x86_decode_init(pbi);
+#endif
+}
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
new file mode 100644
index 0000000..6875585
--- /dev/null
+++ b/vp8/decoder/onyxd_if.c
@@ -0,0 +1,451 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+#include "onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "alloccommon.h"
+#include "vpx_scale/yv12extend.h"
+#include "loopfilter.h"
+#include "swapyv12buffer.h"
+#include "g_common.h"
+#include "threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include "segmentation_common.h"
+#include "quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+
+// DEBUG code
+#if CONFIG_DEBUG
+// Debug helper: appends the Y, then U, then V plane of *s to the file `name`,
+// one fwrite() per row. Does nothing when the file cannot be opened.
+void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
+{
+    FILE *yuv_file = fopen((char *)name, "ab");
+    unsigned char *src = s->y_buffer;
+    int h = s->y_height;
+
+    if (!yuv_file)      // fix: the stream used to be written without this check
+        return;
+
+    for (; h > 0; h--)  // a pre-tested loop also copes with a zero-height plane
+    {
+        fwrite(src, s->y_width, 1,  yuv_file);
+        src += s->y_stride;
+    }
+
+    src = s->u_buffer;
+
+    for (h = s->uv_height; h > 0; h--)
+    {
+        fwrite(src, s->uv_width, 1,  yuv_file);
+        src += s->uv_stride;
+    }
+
+    src = s->v_buffer;
+
+    for (h = s->uv_height; h > 0; h--)
+    {
+        fwrite(src, s->uv_width, 1, yuv_file);
+        src += s->uv_stride;
+    }
+
+    fclose(yuv_file);
+}
+#endif
+
+void vp8dx_initialize()
+{
+    static int init_done = 0;   // latched once the set-up below has run
+    if (init_done)
+        return;
+
+    // First call only: run the one-time common and scaler set-up.
+    vp8_initialize_common();
+    vp8_scale_machine_specific_config();
+    init_done = 1;
+}
+
+
+VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+{
+    VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
+    // Allocates (32-byte aligned), zeroes and initialises one decoder instance; returns it as a handle.
+    if (!pbi)
+        return NULL;            // allocation failed
+
+    vpx_memset(pbi, 0, sizeof(VP8D_COMP));
+    // Error landing pad: a longjmp() to error.jmp during the set-up below resumes inside this if.
+    if (setjmp(pbi->common.error.jmp))
+    {
+        pbi->common.error.setjmp = 0;
+        vp8dx_remove_decompressor(pbi);     // release the partially built instance
+        return 0;
+    }
+
+    pbi->common.error.setjmp = 1;   // NOTE(review): presumably tells the error reporter that jmp is valid - confirm
+    vp8dx_initialize();
+
+    vp8_create_common(&pbi->common);
+    vp8_dmachine_specific_config(pbi);
+
+    pbi->common.current_video_frame = 0;
+    pbi->ready_for_new_data = 1;
+
+    pbi->CPUFreq = 0; //vp8_get_processor_freq();
+    pbi->max_threads = oxcf->max_threads;
+    vp8_decoder_create_threads(pbi);
+
+    //vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
+    // unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+    vp8cx_init_de_quantizer(pbi);
+
+    {
+        VP8_COMMON *cm = &pbi->common;
+
+        vp8_init_loop_filter(cm);
+        cm->last_frame_type = KEY_FRAME;
+        cm->last_filter_type = cm->filter_type;
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+    // Set-up finished: clear the flag again before handing out the instance.
+    pbi->common.error.setjmp = 0;
+    return (VP8D_PTR) pbi;
+}
+
+
+void vp8dx_remove_decompressor(VP8D_PTR ptr)
+{
+    VP8D_COMP *const decoder = (VP8D_COMP *) ptr;
+    // Tears down a decoder instance; a NULL handle is accepted and ignored.
+    if (decoder)
+    {
+        vp8_decoder_remove_threads(decoder);
+        vp8_remove_common(&decoder->common);
+        vpx_free(decoder);
+    }
+}
+
+
+void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
+{
+    // Placeholder: no setting is acted on yet, so the decoder handle and
+    // the value are both deliberately unused.
+    (void) comp;
+    (void) x;
+    switch (oxst)
+    {
+    case VP8D_OK:
+    default:
+        break;
+    }
+}
+
+int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
+{
+    // Placeholder: no setting can be queried yet; every request yields -1.
+    (void) comp;
+
+    switch (oxst)
+    {
+    case VP8D_OK:
+    default:
+        break;
+    }
+
+    return -1;
+}
+
+int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    // Copies the reference buffer selected by ref_frame_flag into *sd.
+    // Returns 0 on success, -1 when the flag matches none of the three.
+    VP8_COMMON *cm = &((VP8D_COMP *) ptr)->common;
+    YV12_BUFFER_CONFIG *ref;
+
+    if (ref_frame_flag == VP8_LAST_FLAG)
+        ref = &cm->last_frame;
+    else if (ref_frame_flag == VP8_GOLD_FLAG)
+        ref = &cm->golden_frame;
+    else if (ref_frame_flag == VP8_ALT_FLAG)
+        ref = &cm->alt_ref_frame;
+    else
+        return -1;
+
+    vp8_yv12_copy_frame_ptr(ref, sd);
+    return 0;
+}
+int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    // Overwrites the reference buffer selected by ref_frame_flag with *sd.
+    // Returns 0 on success, -1 when the flag matches none of the three.
+    VP8_COMMON *cm = &((VP8D_COMP *) ptr)->common;
+    YV12_BUFFER_CONFIG *ref;
+
+    if (ref_frame_flag == VP8_LAST_FLAG)
+        ref = &cm->last_frame;
+    else if (ref_frame_flag == VP8_GOLD_FLAG)
+        ref = &cm->golden_frame;
+    else if (ref_frame_flag == VP8_ALT_FLAG)
+        ref = &cm->alt_ref_frame;
+    else
+        return -1;
+
+    vp8_yv12_copy_frame_ptr(sd, ref);
+    return 0;
+}
+
+//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+#if HAVE_ARMV7
+extern void vp8_push_neon(INT64 *store);
+extern void vp8_pop_neon(INT64 *store);
+static INT64 dx_store_reg[8];
+#endif
+int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsigned char *source, INT64 time_stamp)
+{
+    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
+    VP8_COMMON *cm = &pbi->common;
+    int retcode = 0;
+    struct vpx_usec_timer timer;
+
+    // Decodes the `size` bytes at `source` as one frame and updates the
+    // reference buffers. Returns vp8_decode_frame()'s result, or -1 when ptr
+    // is NULL or an error longjmp()s back into this function.
+//  if(pbi->ready_for_new_data == 0)
+//      return -1;
+
+    if (ptr == 0)
+        return -1;
+
+    pbi->common.error.error_code = VPX_CODEC_OK;
+
+    if (setjmp(pbi->common.error.jmp))
+    {
+#if HAVE_ARMV7
+        vp8_pop_neon(dx_store_reg);     // fix: this exit used to skip the pop
+#endif
+        pbi->common.error.setjmp = 0;
+        return -1;
+    }
+
+    pbi->common.error.setjmp = 1;
+
+#if HAVE_ARMV7
+    vp8_push_neon(dx_store_reg);
+#endif
+
+    vpx_usec_timer_start(&timer);
+
+    //cm->current_video_frame++;
+    pbi->Source = source;
+    pbi->source_sz = size;
+
+    retcode = vp8_decode_frame(pbi);
+
+    if (retcode < 0)
+    {
+#if HAVE_ARMV7
+        vp8_pop_neon(dx_store_reg);
+#endif
+        pbi->common.error.error_code = VPX_CODEC_ERROR;
+        pbi->common.error.setjmp = 0;
+        return retcode;
+    }
+
+    // Update the GF usage maps.
+    vp8_update_gf_useage_maps(cm, &pbi->mb);
+
+    if (pbi->b_multithreaded_lf && pbi->common.filter_level != 0)
+        vp8_stop_lfthread(pbi);
+
+    if (cm->refresh_last_frame)
+    {
+        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+        cm->frame_to_show = &cm->last_frame;
+    }
+    else
+    {
+        cm->frame_to_show = &cm->new_frame;
+    }
+
+    if (!pbi->b_multithreaded_lf)
+    {
+        struct vpx_usec_timer lpftimer;
+        vpx_usec_timer_start(&lpftimer);
+        // Apply the loop filter if appropriate.
+        if (cm->filter_level > 0)
+        {
+            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
+            cm->last_frame_type = cm->frame_type;
+            cm->last_filter_type = cm->filter_type;
+            cm->last_sharpness_level = cm->sharpness_level;
+        }
+
+        vpx_usec_timer_mark(&lpftimer);
+        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+#if 0
+    // DEBUG code
+    //vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+    if (cm->current_video_frame <= 5)
+        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
+#endif
+
+    // If any buffer copy / swapping is signalled it should be done here.
+    if (cm->copy_buffer_to_arf)
+    {
+        if (cm->copy_buffer_to_arf == 1)
+        {
+            if (cm->refresh_last_frame)
+                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
+            else
+                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
+        }
+        else if (cm->copy_buffer_to_arf == 2)
+            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
+    }
+
+    if (cm->copy_buffer_to_gf)
+    {
+        if (cm->copy_buffer_to_gf == 1)
+        {
+            if (cm->refresh_last_frame)
+                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
+            else
+                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+        }
+        else if (cm->copy_buffer_to_gf == 2)
+            vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
+    }
+
+    // Should the golden or alternate reference frame be refreshed?
+    if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+    {
+        if (cm->refresh_golden_frame)
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+
+        if (cm->refresh_alt_ref_frame)
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+        //vpx_log("Decoder: recovery frame received \n");
+
+        // Update the data structures that monitor GF usage
+        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+    }
+
+    vp8_clear_system_state();
+
+    vpx_usec_timer_mark(&timer);
+    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
+
+    pbi->time_decoding += pbi->decode_microseconds;
+
+//  vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);
+
+    if (cm->show_frame)
+        cm->current_video_frame++;
+
+    pbi->ready_for_new_data = 0;
+    pbi->last_time_stamp = time_stamp;
+
+#if 0
+    {
+        int i;
+        INT64 earliest_time = pbi->dr[0].time_stamp;
+        INT64 latest_time = pbi->dr[0].time_stamp;
+        INT64 time_diff = 0;
+        int bytes = 0;
+
+        pbi->dr[pbi->common.current_video_frame&0xf].size = pbi->bc.pos + pbi->bc2.pos + 4;;
+        pbi->dr[pbi->common.current_video_frame&0xf].time_stamp = time_stamp;
+
+        for (i = 0; i < 16; i++)
+        {
+
+            bytes += pbi->dr[i].size;
+
+            if (pbi->dr[i].time_stamp < earliest_time)
+                earliest_time = pbi->dr[i].time_stamp;
+
+            if (pbi->dr[i].time_stamp > latest_time)
+                latest_time = pbi->dr[i].time_stamp;
+        }
+
+        time_diff = latest_time - earliest_time;
+
+        if (time_diff > 0)
+        {
+            pbi->common.bitrate = 80000.00 * bytes / time_diff  ;
+            pbi->common.framerate = 160000000.00 / time_diff ;
+        }
+
+    }
+#endif
+
+#if HAVE_ARMV7
+    vp8_pop_neon(dx_store_reg);
+#endif
+    pbi->common.error.setjmp = 0;
+    return retcode;
+}
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
+{
+    // Hands the most recently decoded frame to the caller through *sd.
+    // Returns -1 when there is nothing to show; otherwise the post-processor
+    // result (CONFIG_POSTPROC) or 0 once *sd has been filled in.
+    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
+    int ret = -1;
+
+    // Nothing has been decoded since the last fetch, or the decoded frame
+    // is not meant to be shown: there is no raw frame to return.
+    if (pbi->ready_for_new_data == 1 || pbi->common.show_frame == 0)
+        return ret;
+
+    pbi->ready_for_new_data = 1;
+    *time_stamp = pbi->last_time_stamp;
+    *time_end_stamp = 0;
+
+    sd->clrtype = pbi->common.clr_type;
+#if CONFIG_POSTPROC
+    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+#else
+
+    // Without a frame to show, ret simply keeps its initial -1.
+    if (pbi->common.frame_to_show)
+    {
+        // Shallow copy of the buffer descriptor, then report the stream's
+        // dimensions in place of the buffer's own.
+        *sd = *pbi->common.frame_to_show;
+        sd->y_width = pbi->common.Width;
+        sd->y_height = pbi->common.Height;
+        sd->uv_height = pbi->common.Height / 2;
+        ret = 0;
+    }
+
+#endif //!CONFIG_POSTPROC
+    vp8_clear_system_state();
+    return ret;
+}
diff --git a/vp8/decoder/onyxd_if_sjl.c b/vp8/decoder/onyxd_if_sjl.c
new file mode 100644
index 0000000..363ad5d
--- /dev/null
+++ b/vp8/decoder/onyxd_if_sjl.c
@@ -0,0 +1,398 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "postproc.h"
+#include "onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "alloccommon.h"
+#include "vpx_scale/yv12extend.h"
+#include "loopfilter.h"
+#include "swapyv12buffer.h"
+#include "g_common.h"
+#include "threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include "segmentation_common.h"
+#include "quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+
+
+#ifndef VPX_NO_GLOBALS
+static int init_ct = 0;
+#else
+# include "vpx_global_handling.h"
+# define init_ct ((int)vpxglobalm(onyxd,init_ct))
+#endif
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+extern void init_detokenizer(VP8D_COMP *dx);
+
+// DEBUG code
+// Debug helper: appends the Y, then U, then V plane of *s to the file `name`,
+// one fwrite() per row. Does nothing when the file cannot be opened.
+void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
+{
+    FILE *yuv_file = fopen((char *)name, "ab");
+    unsigned char *src = s->y_buffer;
+    int h = s->y_height;
+
+    if (!yuv_file)      // fix: the stream used to be written without this check
+        return;
+
+    for (; h > 0; h--)  // a pre-tested loop also copes with a zero-height plane
+    {
+        fwrite(src, s->y_width, 1,  yuv_file);
+        src += s->y_stride;
+    }
+
+    src = s->u_buffer;
+
+    for (h = s->uv_height; h > 0; h--)
+    {
+        fwrite(src, s->uv_width, 1,  yuv_file);
+        src += s->uv_stride;
+    }
+
+    src = s->v_buffer;
+
+    for (h = s->uv_height; h > 0; h--)
+    {
+        fwrite(src, s->uv_width, 1, yuv_file);
+        src += s->uv_stride;
+    }
+
+    fclose(yuv_file);
+}
+
+void vp8dx_initialize()
+{
+    // Counted start-up: every call bumps init_ct, only the first initialises.
+    if (init_ct++ != 0)
+        return;
+
+    vp8_initialize_common();
+    vp8_scale_machine_specific_config();
+}
+
+void vp8dx_shutdown()
+{
+    // Counterpart of vp8dx_initialize(): drops one count from init_ct and
+    // shuts the common code down when the count reaches zero.
+    if (--init_ct == 0)
+        vp8_shutdown_common();
+}
+
+
+VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+{
+    // Builds a zero-initialised, 32-byte aligned decoder instance and runs
+    // the one-time and per-instance set-up on it. Returns NULL when the
+    // allocation fails, otherwise the new instance as an opaque handle.
+    VP8D_COMP *decoder = vpx_memalign(32, sizeof(VP8D_COMP));
+    VP8_COMMON *common;
+
+    if (decoder == NULL)
+        return NULL;
+
+    vpx_memset(decoder, 0, sizeof(VP8D_COMP));
+    common = &decoder->common;
+
+    vp8dx_initialize();
+    vp8_create_common(common);
+    vp8_dmachine_specific_config(decoder);
+
+    common->current_video_frame = 0;
+    decoder->ready_for_new_data = 1;
+
+    decoder->CPUFreq = 0; //vp8_get_processor_freq();
+    decoder->max_threads = oxcf->max_threads;
+    vp8_decoder_create_threads(decoder);
+
+    // First call of vp8cx_init_de_quantizer(); frame_init_dequantizer()
+    // should check for this rather than repeating it on every frame.
+    vp8cx_init_de_quantizer(decoder);
+
+    // Seed the "last frame" loop-filter state from the current settings.
+    vp8_init_loop_filter(common);
+    common->last_frame_type = KEY_FRAME;
+    common->last_filter_type = common->filter_type;
+    common->last_sharpness_level = common->sharpness_level;
+
+    init_detokenizer(decoder);
+    return (VP8D_PTR) decoder;
+}
+void vp8dx_remove_decompressor(VP8D_PTR ptr)
+{
+    // Frees a decoder instance and drops its init count; NULL is ignored.
+    VP8D_COMP *const decoder = (VP8D_COMP *) ptr;
+
+    if (decoder)
+    {
+        vp8_decoder_remove_threads(decoder);
+        vp8_remove_common(&decoder->common);
+        vpx_free(decoder);
+        vp8dx_shutdown();
+    }
+}
+
+void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
+{
+    // Placeholder: no setting is acted on yet, so the decoder handle and
+    // the value are both deliberately unused.
+    (void) comp;
+    (void) x;
+    switch (oxst)
+    {
+    case VP8D_OK:
+    default:
+        break;
+    }
+}
+
+int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
+{
+    // Placeholder: no setting can be queried yet; every request yields -1.
+    (void) comp;
+
+    switch (oxst)
+    {
+    case VP8D_OK:
+    default:
+        break;
+    }
+
+    return -1;
+}
+
+int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    // Copies the reference buffer selected by ref_frame_flag into *sd.
+    // Returns 0 on success, -1 when the flag matches none of the three.
+    VP8_COMMON *cm = &((VP8D_COMP *) ptr)->common;
+    YV12_BUFFER_CONFIG *ref;
+
+    if (ref_frame_flag == VP8_LAST_FLAG)
+        ref = &cm->last_frame;
+    else if (ref_frame_flag == VP8_GOLD_FLAG)
+        ref = &cm->golden_frame;
+    else if (ref_frame_flag == VP8_ALT_FLAG)
+        ref = &cm->alt_ref_frame;
+    else
+        return -1;
+
+    vp8_yv12_copy_frame_ptr(ref, sd);
+    return 0;
+}
+int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    // Overwrites the reference buffer selected by ref_frame_flag with *sd.
+    // Returns 0 on success, -1 when the flag matches none of the three.
+    VP8_COMMON *cm = &((VP8D_COMP *) ptr)->common;
+    YV12_BUFFER_CONFIG *ref;
+
+    if (ref_frame_flag == VP8_LAST_FLAG)
+        ref = &cm->last_frame;
+    else if (ref_frame_flag == VP8_GOLD_FLAG)
+        ref = &cm->golden_frame;
+    else if (ref_frame_flag == VP8_ALT_FLAG)
+        ref = &cm->alt_ref_frame;
+    else
+        return -1;
+
+    vp8_yv12_copy_frame_ptr(sd, ref);
+    return 0;
+}
+int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, char *source, INT64 time_stamp)
+{
+    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
+    VP8_COMMON *cm = &pbi->common;
+    int retcode = 0;
+    // Decodes one frame from `source`. Returns -1 for a NULL handle, otherwise vp8_decode_frame()'s result.
+    struct vpx_usec_timer timer;
+    (void) size;    // the byte count is not used by this variant
+
+//  if(pbi->ready_for_new_data == 0)
+//      return -1;
+
+    vpx_usec_timer_start(&timer);
+
+    if (ptr == 0)
+    {
+        return -1;
+    }
+
+    //cm->current_video_frame++;
+    pbi->Source = source;
+
+    retcode = vp8_decode_frame(pbi);
+
+    if (retcode < 0)
+        return retcode;
+
+    // Update the GF usage maps.
+    vp8_update_gf_useage_maps(cm, &pbi->mb);
+
+    if (pbi->b_multithreaded)   // NOTE(review): onyxd_int.h in this patch declares b_multithreaded_rd / _lf only - confirm this file still builds
+        vp8_stop_lfthread(pbi);
+
+    if (cm->refresh_last_frame)
+    {
+        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+
+        cm->frame_to_show = &cm->last_frame;
+    }
+    else
+    {
+        cm->frame_to_show = &cm->new_frame;
+    }
+
+    if (!pbi->b_multithreaded)
+    {
+        struct vpx_usec_timer lpftimer;
+        vpx_usec_timer_start(&lpftimer);
+        // Apply the loop filter if appropriate.
+
+        if (cm->filter_level > 0)
+        {
+            vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
+            cm->last_frame_type = cm->frame_type;
+            cm->last_filter_type = cm->filter_type;
+            cm->last_sharpness_level = cm->sharpness_level;
+
+        }
+
+        vpx_usec_timer_mark(&lpftimer);
+        pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+#if 0
+    // DEBUG code
+    //vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+    if (cm->current_video_frame <= 5)
+        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
+#endif
+
+    // If any buffer copy / swapping is signalled it should be done here.
+    if (cm->copy_buffer_to_arf)
+    {
+        if (cm->copy_buffer_to_arf == 1)
+        {
+            if (cm->refresh_last_frame)
+                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
+            else
+                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
+        }
+        else if (cm->copy_buffer_to_arf == 2)
+            vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
+    }
+
+    if (cm->copy_buffer_to_gf)
+    {
+        if (cm->copy_buffer_to_gf == 1)
+        {
+            if (cm->refresh_last_frame)
+                vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
+            else
+                vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+        }
+        else if (cm->copy_buffer_to_gf == 2)
+            vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
+    }
+
+    // Should the golden or alternate reference frame be refreshed?
+    if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+    {
+        if (cm->refresh_golden_frame)
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+
+        if (cm->refresh_alt_ref_frame)
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+        //vpx_log("Decoder: recovery frame received \n");
+
+        // Update the data structures that monitor GF usage
+        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+    }
+
+    vp8_clear_system_state();
+
+    vpx_usec_timer_mark(&timer);
+    pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer);
+
+    pbi->time_decoding += pbi->decode_microseconds;
+
+//  vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);
+
+    cm->current_video_frame++;
+    pbi->ready_for_new_data = 0;
+    pbi->last_time_stamp = time_stamp;
+    // Record this frame in the 16-entry dr[] ring, then re-derive bitrate and framerate from the ring.
+    {
+        int i;
+        INT64 earliest_time = pbi->dr[0].time_stamp;
+        INT64 latest_time = pbi->dr[0].time_stamp;
+        INT64 time_diff = 0;
+        int bytes = 0;
+
+        pbi->dr[pbi->common.current_video_frame&0xf].size = pbi->bc.pos + pbi->bc2.pos + 4;;
+        pbi->dr[pbi->common.current_video_frame&0xf].time_stamp = time_stamp;
+
+        for (i = 0; i < 16; i++)
+        {
+
+            bytes += pbi->dr[i].size;
+
+            if (pbi->dr[i].time_stamp < earliest_time)
+                earliest_time = pbi->dr[i].time_stamp;
+
+            if (pbi->dr[i].time_stamp > latest_time)
+                latest_time = pbi->dr[i].time_stamp;
+        }
+
+        time_diff = latest_time - earliest_time;
+
+        if (time_diff > 0)  // guards the divisions below
+        {
+            pbi->common.bitrate = 80000.00 * bytes / time_diff  ;
+            pbi->common.framerate = 160000000.00 / time_diff ;
+        }
+
+    }
+    return retcode;
+}
+int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, int deblock_level,  int noise_level, int flags)
+{
+    // Post-processes the last decoded frame into *sd and returns the
+    // post-processor's result, or -1 when there is no frame to show.
+    VP8D_COMP *pbi = (VP8D_COMP *) ptr;
+    int ret;
+
+    // No new decode since the last fetch, or a frame not meant for display.
+    if (pbi->ready_for_new_data == 1 || pbi->common.show_frame == 0)
+        return -1;
+
+    pbi->ready_for_new_data = 1;
+    *time_stamp = pbi->last_time_stamp;
+    *time_end_stamp = 0;
+
+    sd->clrtype = pbi->common.clr_type;
+    ret = vp8_post_proc_frame(&pbi->common, sd, deblock_level, noise_level, flags);
+
+    vp8_clear_system_state();
+    return ret;
+}
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
new file mode 100644
index 0000000..fa4fa48
--- /dev/null
+++ b/vp8/decoder/onyxd_int.h
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8D_INT_H
+#define __INC_VP8D_INT_H
+#include "vpx_ports/config.h"
+#include "onyxd.h"
+#include "treereader.h"
+#include "onyxc_int.h"
+#include "threading.h"
+#include "dequantize.h"
+
+typedef struct     // NOTE(review): presumably the argument bundle handed to each decoding thread - confirm
+{
+    int ithread;    // presumably the thread's index - confirm
+    void *ptr1;     // untyped payload; meaning is set by the code that fills it in
+    void *ptr2;     // untyped payload; meaning is set by the code that fills it in
+} DECODETHREAD_DATA;
+
+typedef struct     // per-entry state of the VP8D_COMP::mb_row_di array
+{
+    MACROBLOCKD  mbd;       // private MACROBLOCKD, filled in by vp8_setup_decoding_thread_data()
+    int mb_row;             // NOTE(review): presumably the macroblock row being decoded - confirm
+    int current_mb_col;     // NOTE(review): presumably decode progress within that row - confirm
+    short *coef_ptr;
+} MB_ROW_DEC;
+
+typedef struct     // one entry of the VP8D_COMP::dr[16] rate-tracking ring
+{
+    INT64 time_stamp;   // time stamp supplied with the recorded frame
+    int size;           // recorded as bc.pos + bc2.pos + 4 by the rate bookkeeping code
+} DATARATE;
+
+typedef struct     // extra-bits description of one DCT value category
+{
+    INT16         min_val;  // starting value of the category before extra bits are applied
+    INT16         Length;   // NOTE(review): presumably the number of extra bits - confirm
+    UINT8 Probs[12];        // NOTE(review): presumably one probability per extra bit - confirm
+} TOKENEXTRABITS;
+
+typedef struct     // detokenizer state; embedded in VP8D_COMP as `detoken`
+{
+    int *scan;
+    UINT8 *ptr_onyxblock2context_leftabove;
+    vp8_tree_index *vp8_coef_tree_ptr;  //onyx_coef_tree_ptr; ???
+    TOKENEXTRABITS *teb_base_ptr;
+    unsigned char *norm_ptr;
+//  UINT16 *ptr_onyx_coef_bands_x;
+    UINT8 *ptr_onyx_coef_bands_x;
+    // NOTE(review): A / L look like the above / left entropy contexts - confirm.
+    ENTROPY_CONTEXT   **A;
+    ENTROPY_CONTEXT(*L)[4];
+
+    INT16 *qcoeff_start_ptr;
+    BOOL_DECODER *current_bc;
+
+    UINT8 *coef_probs[4];
+
+    UINT8 eob[25];  // NOTE(review): presumably one end-of-block position per block of a macroblock - confirm
+
+} DETOK;
+
+typedef struct VP8Decompressor     // one decoder instance; created by vp8dx_create_decompressor()
+{
+    DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+    DECLARE_ALIGNED(16, VP8_COMMON, common);
+
+    vp8_reader bc, bc2;
+
+    VP8D_CONFIG oxcf;
+
+    // Compressed input of the frame being decoded; set by vp8dx_receive_compressed_data().
+    const unsigned char *Source;
+    unsigned int   source_sz;
+
+    // Timing: decode_microseconds holds the last frame's time; the time_* fields accumulate.
+    unsigned int CPUFreq;
+    unsigned int decode_microseconds;
+    unsigned int time_decoding;
+    unsigned int time_loop_filtering;
+
+    volatile int b_multithreaded_rd;
+    volatile int b_multithreaded_lf;    // when set, the loop filter is not run inline by the receive call
+    int max_threads;                    // copied from VP8D_CONFIG::max_threads at creation
+    int last_mb_row_decoded;
+    int current_mb_col_main;
+    int decoding_thread_count;
+    int allocated_decoding_thread_count;
+
+    // variable for threading
+    DECLARE_ALIGNED(16, MACROBLOCKD, lpfmb);
+#if CONFIG_MULTITHREAD
+    pthread_t           h_thread_lpf;         // thread for postprocessing
+    sem_t               h_event_lpf;          // Event for post_proc completed
+    sem_t               h_event_start_lpf;
+#endif
+    MB_ROW_DEC           *mb_row_di;
+    DECODETHREAD_DATA   *de_thread_data;
+#if CONFIG_MULTITHREAD
+    pthread_t           *h_decoding_thread;
+    sem_t               *h_event_mbrdecoding;
+    sem_t               h_event_main;
+    // end of threading data
+#endif
+    vp8_reader *mbc;
+    INT64 last_time_stamp;      // time stamp passed with the most recently decoded frame
+    int   ready_for_new_data;   // 1 at creation and after a frame is fetched; 0 once a frame has been decoded
+
+    DATARATE dr[16];            // ring of recent frame sizes and time stamps
+
+    DETOK detoken;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+    vp8_dequant_rtcd_vtable_t        dequant;       // filled in by vp8_dmachine_specific_config()
+    struct vp8_dboolhuff_rtcd_vtable dboolhuff;     // filled in by vp8_dmachine_specific_config()
+#endif
+
+} VP8D_COMP;
+
+int vp8_decode_frame(VP8D_COMP *cpi);
+void vp8_dmachine_specific_config(VP8D_COMP *pbi);
+/* CHECK_MEM_ERROR(lval, expr): assigns expr to lval and, when the result is null, reports VPX_CODEC_MEM_ERROR
+ * through a `pbi` that must be in scope at the call site. CONFIG_DEBUG builds add file and line to the message. */
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval" at %s:%d", \
+                               __FILE__,__LINE__);\
+    } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval);\
+    } while(0)
+#endif
+
+#endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
new file mode 100644
index 0000000..e35d175
--- /dev/null
+++ b/vp8/decoder/threading.c
@@ -0,0 +1,596 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef WIN32
+# include <unistd.h>
+#endif
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "threading.h"
+
+#include "loopfilter.h"
+#include "extend.h"
+#include "vpx_ports/vpx_timer.h"
+
+extern void vp8_decode_mb_row(VP8D_COMP *pbi,
+                              VP8_COMMON *pc,
+                              int mb_row,
+                              MACROBLOCKD *xd);
+
+extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
+extern void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+/*
+ * Prepare the per-worker MACROBLOCKD state before a frame is decoded with
+ * multiple threads.
+ *
+ *   pbi   - decoder instance (frame-level state is read from pbi->common)
+ *   xd    - the main thread's macroblock descriptor, used as the template
+ *   mbrd  - array of worker row-decoder slots to initialise
+ *   count - number of entries of mbrd to initialise
+ *
+ * Without CONFIG_MULTITHREAD this is a no-op.
+ */
+void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+{
+#if CONFIG_MULTITHREAD
+    VP8_COMMON *const pc = &pbi->common;
+    int t;
+
+    for (t = 0; t < count; t++)
+    {
+        MACROBLOCKD *const wmb = &mbrd[t].mbd;
+        int blk;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+        wmb->rtcd = xd->rtcd;
+#endif
+
+        // prediction function pointers and golden-frame usage map come from the main descriptor
+        wmb->subpixel_predict        = xd->subpixel_predict;
+        wmb->subpixel_predict8x4     = xd->subpixel_predict8x4;
+        wmb->subpixel_predict8x8     = xd->subpixel_predict8x8;
+        wmb->subpixel_predict16x16   = xd->subpixel_predict16x16;
+        wmb->gf_active_ptr           = xd->gf_active_ptr;
+
+        // worker t starts (t + 1) mode-info rows below the top of the frame
+        wmb->mode_info         = pc->mi - 1;
+        wmb->mode_info_context = pc->mi + pc->mode_info_stride * (t + 1);
+        wmb->mode_info_stride  = pc->mode_info_stride;
+
+        // frame-level state
+        wmb->frame_type                = pc->frame_type;
+        wmb->frames_since_golden       = pc->frames_since_golden;
+        wmb->frames_till_alt_ref_frame = pc->frames_till_alt_ref_frame;
+
+        wmb->pre = pc->last_frame;
+        wmb->dst = pc->new_frame;
+
+        // block pointers/offsets are built after pre/dst have been copied in
+        vp8_setup_block_dptrs(wmb);
+        vp8_build_block_doffsets(wmb);
+
+        // segmentation settings
+        wmb->segmentation_enabled  = xd->segmentation_enabled;
+        wmb->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+        vpx_memcpy(wmb->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+        wmb->mbmi.mode    = DC_PRED;
+        wmb->mbmi.uv_mode = DC_PRED;
+
+        wmb->current_bc = &pbi->bc2;
+
+        // share the dequantisation tables of all 25 blocks
+        for (blk = 0; blk < 25; blk++)
+            wmb->block[blk].dequant = xd->block[blk].dequant;
+    }
+
+#else
+    (void) pbi;
+    (void) xd;
+    (void) mbrd;
+    (void) count;
+#endif
+}
+
+
+/*
+ * Entry point of a macroblock-row decoding worker thread.
+ *
+ * p_data is this worker's DECODETHREAD_DATA: ithread is the worker index,
+ * ptr1 the VP8D_COMP being decoded and ptr2 the worker's MB_ROW_DEC slot.
+ *
+ * The thread blocks on h_event_mbrdecoding[ithread]. Each wake-up decodes the
+ * row stored in mbrd->mb_row while staying behind the column progress of the
+ * row above (worker ithread - 1, or the main thread for worker 0). The loop
+ * ends once b_multithreaded_rd has been cleared. Always returns 0.
+ */
+THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
+{
+#if CONFIG_MULTITHREAD
+    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
+    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
+    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
+    ENTROPY_CONTEXT mb_row_left_context[4][4];
+
+    while (1)
+    {
+        if (pbi->b_multithreaded_rd == 0)
+            break;
+
+        //if(WaitForSingleObject(pbi->h_event_mbrdecoding[ithread], INFINITE) == WAIT_OBJECT_0)
+        if (sem_wait(&pbi->h_event_mbrdecoding[ithread]) == 0)
+        {
+            // the semaphore is also posted at shutdown, so re-check the flag
+            if (pbi->b_multithreaded_rd == 0)
+                break;
+            else
+            {
+                VP8_COMMON *pc = &pbi->common;
+                int mb_row       = mbrd->mb_row;
+                MACROBLOCKD *xd = &mbrd->mbd;
+
+                //printf("ithread:%d mb_row %d\n", ithread, mb_row);
+                int i;
+                int recon_yoffset, recon_uvoffset;
+                int mb_col;
+                int recon_y_stride = pc->last_frame.y_stride;
+                int recon_uv_stride = pc->last_frame.uv_stride;
+
+                // column progress of the row directly above this one
+                volatile int *last_row_current_mb_col;
+
+                if (ithread > 0)
+                    last_row_current_mb_col = &pbi->mb_row_di[ithread-1].current_mb_col;
+                else
+                    last_row_current_mb_col = &pbi->current_mb_col_main;
+
+                recon_yoffset = mb_row * recon_y_stride * 16;
+                recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+                // reset above block coeffs
+                xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
+                xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
+                xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
+                xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
+                xd->left_context = mb_row_left_context;
+                vpx_memset(mb_row_left_context, 0, sizeof(mb_row_left_context));
+                xd->up_available = (mb_row != 0);
+
+                // Negate after shifting: left-shifting a negative value is undefined in C.
+                // This also matches the form used for mb_to_left_edge below.
+                xd->mb_to_top_edge = -((mb_row * 16) << 3);
+                xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+                for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+                {
+                    // Spin until the row above has completed column mb_col + 1,
+                    // or has reached its final column.
+                    while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != pc->mb_cols - 1)
+                    {
+                        x86_pause_hint();
+                        thread_sleep(0);
+                    }
+
+                    // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
+                    // NOTE(review): the byte count is hard-coded to 32 rather than sizeof(MB_MODE_INFO);
+                    // confirm the two agree before changing either.
+                    vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi, 32); //sizeof(MB_MODE_INFO) );
+
+                    if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
+                    {
+                        // these modes carry per-block info; copy it for the 16 luma blocks
+                        for (i = 0; i < 16; i++)
+                        {
+                            BLOCKD *d = &xd->block[i];
+                            vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
+                        }
+                    }
+
+                    // Distance of Mb to the various image edges.
+                    // These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                    xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+                    xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
+                    xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
+                    xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;
+
+                    xd->left_available = (mb_col != 0);
+
+                    // Select the appropriate reference frame for this MB
+                    if (xd->mbmi.ref_frame == LAST_FRAME)
+                    {
+                        xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
+                        xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
+                        xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
+                    }
+                    else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
+                    {
+                        // Golden frame reconstruction buffer
+                        xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
+                        xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
+                        xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
+                    }
+                    else
+                    {
+                        // Alternate reference frame reconstruction buffer
+                        xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
+                        xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
+                        xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
+                    }
+
+                    vp8_build_uvmvs(xd, pc->full_pixel);
+
+                    vp8dx_bool_decoder_fill(xd->current_bc);
+                    vp8_decode_macroblock(pbi, xd);
+
+                    // advance to the next macroblock in this row
+                    recon_yoffset += 16;
+                    recon_uvoffset += 8;
+
+                    ++xd->mode_info_context;  /* next mb */
+
+                    xd->gf_active_ptr++;      // GF usage flag for next MB
+
+                    xd->above_context[Y1CONTEXT] += 4;
+                    xd->above_context[UCONTEXT ] += 2;
+                    xd->above_context[VCONTEXT ] += 2;
+                    xd->above_context[Y2CONTEXT] ++;
+
+                    // publish progress for the thread decoding the row below
+                    pbi->mb_row_di[ithread].current_mb_col = mb_col;
+
+                }
+
+                // adjust to the next row of mbs
+                vp8_extend_mb_row(
+                    &pc->new_frame,
+                    xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
+                );
+
+                ++xd->mode_info_context;      /* skip prediction column */
+
+                // since we have multithread: skip the rows handled by the other threads
+                xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+
+                //memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
+                // only odd rows update the counter polled by the loop-filter thread
+                if ((mb_row & 1) == 1)
+                {
+                    pbi->last_mb_row_decoded = mb_row;
+                    //printf("S%d", pbi->last_mb_row_decoded);
+                }
+
+                // the last worker of a batch, or whoever decodes the final row, releases the main thread
+                if (ithread == (pbi->decoding_thread_count - 1) || mb_row == pc->mb_rows - 1)
+                {
+                    //SetEvent(pbi->h_event_main);
+                    sem_post(&pbi->h_event_main);
+
+                }
+            }
+        }
+    }
+
+#else
+    (void) p_data;
+#endif
+
+    return 0 ;
+}
+
+/*
+ * Entry point of the loop-filter thread.
+ *
+ * p_data is the VP8D_COMP being decoded. The thread blocks on
+ * h_event_start_lpf; each wake-up filters one frame (cm->new_frame) using the
+ * macroblock state snapshot in pbi->lpfmb, trailing behind the row decoders
+ * via pbi->last_mb_row_decoded, and then posts h_event_lpf. The loop ends once
+ * b_multithreaded_lf has been cleared. Always returns 0.
+ */
+THREAD_FUNCTION vp8_thread_loop_filter(void *p_data)
+{
+#if CONFIG_MULTITHREAD
+    VP8D_COMP *pbi = (VP8D_COMP *)p_data;
+
+    while (1)
+    {
+        if (pbi->b_multithreaded_lf == 0)
+            break;
+
+        //printf("before waiting for start_lpf\n");
+
+        //if(WaitForSingleObject(pbi->h_event_start_lpf, INFINITE) == WAIT_OBJECT_0)
+        if (sem_wait(&pbi->h_event_start_lpf) == 0)
+        {
+            if (pbi->b_multithreaded_lf == 0) // we're shutting down
+                break;
+            else
+            {
+
+                VP8_COMMON *cm  = &pbi->common;
+                MACROBLOCKD *mbd = &pbi->lpfmb;
+                int default_filt_lvl = pbi->common.filter_level;
+
+                YV12_BUFFER_CONFIG *post = &cm->new_frame;
+                loop_filter_info *lfi = cm->lf_info;
+
+                int mb_row;
+                int mb_col;
+
+
+                int baseline_filter_level[MAX_MB_SEGMENTS];
+                int filter_level;
+                int alt_flt_enabled = mbd->segmentation_enabled;
+
+                int i;
+                unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+                // read through a volatile pointer: the decoding threads update this counter
+                volatile int *last_mb_row_decoded = &pbi->last_mb_row_decoded;
+
+                //MODE_INFO * this_mb_mode_info = cm->mi;
+                mbd->mode_info_context = cm->mi;          // Point at base of Mb MODE_INFO list
+
+                // Note the baseline filter values for each segment
+                if (alt_flt_enabled)
+                {
+                    for (i = 0; i < MAX_MB_SEGMENTS; i++)
+                    {
+                        // absolute mode: the segment value is the level itself (not clamped here)
+                        if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+                            baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+                        else
+                        {
+                            // delta mode: the segment value is an offset from the frame level
+                            baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+                            baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  // Clamp to valid range
+                        }
+                    }
+                }
+                else
+                {
+                    // no segmentation: every segment uses the frame-level filter level
+                    for (i = 0; i < MAX_MB_SEGMENTS; i++)
+                        baseline_filter_level[i] = default_filt_lvl;
+                }
+
+                // Initialize the loop filter for this frame.
+                vp8_init_loop_filter(cm);
+
+                // Set up the buffer pointers
+                y_ptr = post->y_buffer;
+                u_ptr = post->u_buffer;
+                v_ptr = post->v_buffer;
+
+                // vp8_filter each macro block
+                for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+                {
+
+                    // spin until the decoders report a row index greater than this one
+                    while (mb_row >= *last_mb_row_decoded)
+                    {
+                        x86_pause_hint();
+                        thread_sleep(0);
+                    }
+
+                    //printf("R%d", mb_row);
+                    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+                    {
+                        int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+
+                        filter_level = baseline_filter_level[Segment];
+
+                        // Apply any context driven MB level adjustment
+                        vp8_adjust_mb_lf_value(mbd, &filter_level);
+
+                        if (filter_level)
+                        {
+                            // left macroblock edge: skipped for the first column
+                            if (mb_col > 0)
+                                cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+
+                            // block-level vertical-edge filter, only when dc_diff > 0
+                            if (mbd->mode_info_context->mbmi.dc_diff > 0)
+                                cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+
+                            // don't apply across umv border
+                            if (mb_row > 0)
+                                cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+
+                            // block-level horizontal-edge filter, only when dc_diff > 0
+                            if (mbd->mode_info_context->mbmi.dc_diff > 0)
+                                cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                        }
+
+                        y_ptr += 16;
+                        u_ptr += 8;
+                        v_ptr += 8;
+
+                        mbd->mode_info_context++;     // step to next MB
+
+                    }
+
+                    // move from the end of this macroblock row to the start of the next
+                    y_ptr += post->y_stride  * 16 - post->y_width;
+                    u_ptr += post->uv_stride *  8 - post->uv_width;
+                    v_ptr += post->uv_stride *  8 - post->uv_width;
+
+                    mbd->mode_info_context++;         // Skip border mb
+                }
+
+                //printf("R%d\n", mb_row);
+                // When done, signal the waiter (see vp8_stop_lfthread) that loop filtering is finished
+                //SetEvent(pbi->h_event_lpf);
+                sem_post(&pbi->h_event_lpf);
+            }
+
+        }
+    }
+
+#else
+    (void) p_data;
+#endif
+    return 0;
+}
+
+/*
+ * Start the decoder's helper threads.
+ *
+ * The thread budget is pbi->max_threads capped at 16. With a budget above 1
+ * this creates the loop-filter thread plus (budget - 1) row-decoding workers,
+ * together with their semaphores and per-thread data. With a budget of 1 or
+ * less, only the multithreading flags and counters are reset.
+ */
+void vp8_decoder_create_threads(VP8D_COMP *pbi)
+{
+#if CONFIG_MULTITHREAD
+    int n_cores;
+    int t;
+
+    pbi->b_multithreaded_rd = 0;
+    pbi->b_multithreaded_lf = 0;
+    pbi->allocated_decoding_thread_count = 0;
+
+    n_cores = pbi->max_threads;
+
+    if (n_cores > 16)
+        n_cores = 16;
+
+    // nothing to create when only one thread is allowed
+    if (n_cores <= 1)
+        return;
+
+    // loop-filter thread with its "done" and "start" semaphores
+    sem_init(&pbi->h_event_lpf, 0, 0);
+    sem_init(&pbi->h_event_start_lpf, 0, 0);
+    pbi->b_multithreaded_lf = 1;
+    pthread_create(&pbi->h_thread_lpf, 0, vp8_thread_loop_filter, (pbi));
+
+    // row-decoding workers: the calling thread counts as one of the n_cores
+    pbi->b_multithreaded_rd = 1;
+    pbi->decoding_thread_count = n_cores - 1;
+
+    CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
+    CHECK_MEM_ERROR(pbi->h_event_mbrdecoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
+    CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
+    vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
+    CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
+
+    t = 0;
+
+    while (t < pbi->decoding_thread_count)
+    {
+        DECODETHREAD_DATA *const arg = &pbi->de_thread_data[t];
+
+        sem_init(&pbi->h_event_mbrdecoding[t], 0, 0);
+
+        // argument block handed to the worker: its index, the decoder, its row slot
+        arg->ithread = t;
+        arg->ptr1    = (void *)pbi;
+        arg->ptr2    = (void *) &pbi->mb_row_di[t];
+
+        pthread_create(&pbi->h_decoding_thread[t], 0, vp8_thread_decoding_proc, arg);
+
+        t++;
+    }
+
+    sem_init(&pbi->h_event_main, 0, 0);
+    pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
+
+#else
+    (void) pbi;
+#endif
+}
+
+/*
+ * Stop and tear down the threads created by vp8_decoder_create_threads().
+ *
+ * Each thread is stopped by clearing its run flag, posting the semaphore it
+ * sleeps on and joining it. Afterwards the semaphores are destroyed and the
+ * per-thread arrays are freed and set to NULL. Does nothing for a thread kind
+ * whose run flag is already 0.
+ */
+void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+{
+#if CONFIG_MULTITHREAD
+
+    // shut down the loop-filter thread
+    if (pbi->b_multithreaded_lf)
+    {
+        pbi->b_multithreaded_lf = 0;
+        sem_post(&pbi->h_event_start_lpf);      // wake it so it sees the cleared flag
+        pthread_join(pbi->h_thread_lpf, 0);
+        sem_destroy(&pbi->h_event_start_lpf);
+        // h_event_lpf is initialised together with h_event_start_lpf, so release it here too
+        sem_destroy(&pbi->h_event_lpf);
+    }
+
+    // shut down the MB row decoding threads
+    if (pbi->b_multithreaded_rd)
+    {
+        int i;
+
+        pbi->b_multithreaded_rd = 0;
+
+        // allow all threads to exit
+        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+        {
+            sem_post(&pbi->h_event_mbrdecoding[i]);
+            pthread_join(pbi->h_decoding_thread[i], NULL);
+        }
+
+        // every worker has been joined; the semaphores are no longer in use
+        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+        {
+            sem_destroy(&pbi->h_event_mbrdecoding[i]);
+        }
+
+        sem_destroy(&pbi->h_event_main);
+
+        if (pbi->h_decoding_thread)
+        {
+            vpx_free(pbi->h_decoding_thread);
+            pbi->h_decoding_thread = NULL;
+        }
+
+        if (pbi->h_event_mbrdecoding)
+        {
+            vpx_free(pbi->h_event_mbrdecoding);
+            pbi->h_event_mbrdecoding = NULL;
+        }
+
+        if (pbi->mb_row_di)
+        {
+            vpx_free(pbi->mb_row_di);
+            pbi->mb_row_di = NULL ;
+        }
+
+        if (pbi->de_thread_data)
+        {
+            vpx_free(pbi->de_thread_data);
+            pbi->de_thread_data = NULL;
+        }
+    }
+
+#else
+    (void) pbi;
+#endif
+}
+
+
+/*
+ * Kick off loop filtering of the current frame on the loop-filter thread.
+ *
+ * Snapshots pbi->mb into pbi->lpfmb (the descriptor the loop-filter thread
+ * reads), resets the decoded-row counter the filter trails behind, and posts
+ * h_event_start_lpf to wake the thread.
+ */
+void vp8_start_lfthread(VP8D_COMP *pbi)
+{
+#if CONFIG_MULTITHREAD
+    // vpx_memcpy, like the other copies in this file; <string.h> is not included here
+    vpx_memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
+    pbi->last_mb_row_decoded = 0;
+    sem_post(&pbi->h_event_start_lpf);
+#else
+    (void) pbi;
+#endif
+}
+
+/*
+ * Block until the loop-filter thread has finished the current frame.
+ *
+ * The time spent waiting on h_event_lpf is added to pbi->time_loop_filtering.
+ */
+void vp8_stop_lfthread(VP8D_COMP *pbi)
+{
+#if CONFIG_MULTITHREAD
+    struct vpx_usec_timer lf_wait;
+
+    vpx_usec_timer_start(&lf_wait);
+    sem_wait(&pbi->h_event_lpf);        // posted by vp8_thread_loop_filter after the last row
+    vpx_usec_timer_mark(&lf_wait);
+
+    pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lf_wait);
+#else
+    (void) pbi;
+#endif
+}
+
+
+/*
+ * Decode all macroblock rows of a frame using the main thread plus the
+ * row-decoding workers.
+ *
+ * Rows are handled in batches of (decoding_thread_count + 1): the main thread
+ * decodes the first row of each batch with xd, and worker i gets row
+ * (first + i + 1) if it exists. Token partitions from pbi->mbc are handed out
+ * round-robin, one per row. On return pbi->last_mb_row_decoded holds the loop
+ * counter's final value (>= pc->mb_rows).
+ */
+void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
+                          MACROBLOCKD *xd)
+{
+#if CONFIG_MULTITHREAD
+    VP8_COMMON *pc = &pbi->common;
+    int num_part = 1 << pbi->common.multi_token_partition;
+    int part = 0;               // index of the next token partition to hand out
+    int mb_row = 0;
+
+    vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+
+    while (mb_row < pc->mb_rows)
+    {
+        int w;
+
+        pbi->current_mb_col_main = -1;
+
+        // the main thread takes the first row of the batch
+        xd->current_bc = &pbi->mbc[part];
+        part = (part + 1 == num_part) ? 0 : part + 1;
+
+        // hand the following rows to the workers, as long as rows remain
+        for (w = 0; w < pbi->decoding_thread_count && (mb_row + w + 1) < pc->mb_rows; w++)
+        {
+            MB_ROW_DEC *const slot = &pbi->mb_row_di[w];
+
+            slot->mb_row = mb_row + w + 1;
+            slot->mbd.current_bc = &pbi->mbc[part];
+            part = (part + 1 == num_part) ? 0 : part + 1;
+
+            slot->current_mb_col = -1;
+            sem_post(&pbi->h_event_mbrdecoding[w]);
+        }
+
+        vp8_decode_mb_row(pbi, pc, mb_row, xd);
+
+        // skip the mode-info rows owned by the workers
+        xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+
+        // wait for the batch's workers unless this was the final row of the frame
+        if (mb_row < pc->mb_rows - 1)
+        {
+            sem_wait(&pbi->h_event_main);
+        }
+
+        mb_row += (pbi->decoding_thread_count + 1);
+    }
+
+    pbi->last_mb_row_decoded = mb_row;
+#else
+    (void) pbi;
+    (void) xd;
+#endif
+}
diff --git a/vp8/decoder/treereader.h b/vp8/decoder/treereader.h
new file mode 100644
index 0000000..eb10e24
--- /dev/null
+++ b/vp8/decoder/treereader.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef tree_reader_h
+#define tree_reader_h 1
+
+#include "treecoder.h"
+
+#include "dboolhuff.h"
+
+typedef BOOL_DECODER vp8_reader;
+
+#define vp8_read vp8dx_decode_bool
+#define vp8_read_literal vp8_decode_value
+#define vp8_read_bit( R) vp8_read( R, vp8_prob_half)
+
+
+/* Intent of tree data structure is to make decoding trivial. */
+
+/* Walk tree t from its root, reading one bool per step with the probability
+   p[node >> 1], until a non-positive entry is reached; return that entry
+   negated. */
+static int vp8_treed_read(
+    vp8_reader *const r,        /* !!! must return a 0 or 1 !!! */
+    vp8_tree t,
+    const vp8_prob *const p
+)
+{
+    vp8_tree_index node = 0;
+
+    do
+    {
+        /* the decoded bit selects one of the two entries at t[node], t[node + 1] */
+        node = t[node + vp8_read(r, p[node >> 1])];
+    }
+    while (node > 0);
+
+    return -node;
+}
+
+
+/* Variant reads a binary number given distributions on each bit.
+   Note that tree is arbitrary; probability of decoding a zero
+   may or may not depend on previously decoded bits. */
+
+/* Same tree walk as vp8_treed_read, but the result is the sequence of decoded
+   bits assembled most-significant-first into an integer. */
+static int vp8_treed_read_num(
+    vp8_reader *const r,        /* !!! must return a 0 or 1 !!! */
+    vp8_tree t,
+    const vp8_prob *const p
+)
+{
+    vp8_tree_index node = 0;
+    int value = 0;
+
+    for (;;)
+    {
+        const int bit = vp8_read(r, p[node >> 1]);
+
+        value = (value << 1) + bit;
+        node = t[node + bit];
+
+        /* a non-positive entry ends the walk */
+        if (node <= 0)
+            break;
+    }
+
+    return value;
+}
+#endif /* tree_reader_h */
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
new file mode 100644
index 0000000..02be487
--- /dev/null
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -0,0 +1,410 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+;
+; Element-wise multiply of 16 shorts: dq[i] = low 16 bits of sq[i] * q[i]
+; (pmullw), processed four words (8 bytes) per step.
+; rsi and rdi are saved in the prolog and restored in the epilog.
+global sym(vp8_dequantize_b_impl_mmx)
+sym(vp8_dequantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;sq
+        mov       rdi, arg(1) ;dq
+        mov       rax, arg(2) ;q
+
+        movq      mm1, [rsi]
+        pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]
+        movq      [rdi], mm1
+
+        movq      mm1, [rsi+8]
+        pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]
+        movq      [rdi+8], mm1
+
+        movq      mm1, [rsi+16]
+        pmullw    mm1, [rax+16]            ; mm1 = sq[8..11] * q[8..11]
+        movq      [rdi+16], mm1
+
+        movq      mm1, [rsi+24]
+        pmullw    mm1, [rax+24]            ; mm1 = sq[12..15] * q[12..15]
+        movq      [rdi+24], mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_dequant_idct_mmx(short *input, short *dq, short *output, int pitch)
+;
+; Multiplies the 16 coefficients at input by the 16 factors at dq (pmullw,
+; low 16 bits kept), clears the 32 bytes at input, then runs two butterfly
+; passes with a 4x4 word transpose after each. The second pass adds 4 and
+; shifts right arithmetically by 3. Four 8-byte rows are stored at output,
+; output+pitch, output+2*pitch and output+3*pitch; pitch is added to the
+; pointer unscaled, i.e. it is a byte offset.
+;
+; rsi/rdi are pushed and popped although this routine does not use them.
+; GET_GOT/RESTORE_GOT come from x86_abi_support.asm; presumably they set up
+; rbx for the "GLOBAL" constant references in PIC builds -- confirm there.
+global sym(vp8_dequant_idct_mmx)
+sym(vp8_dequant_idct_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,    arg(0) ;input
+        mov         rdx,    arg(1) ;dq
+
+        ; dequantize: mm0..mm3 = the four rows of input[] * dq[]
+
+        movq        mm0,    [rax   ]
+        pmullw      mm0,    [rdx]
+
+        movq        mm1,    [rax +8]
+        pmullw      mm1,    [rdx +8]
+
+        movq        mm2,    [rax+16]
+        pmullw      mm2,    [rdx+16]
+
+        movq        mm3,    [rax+24]
+        pmullw      mm3,    [rdx+24]
+
+        mov         rdx,    arg(2) ;output
+        pxor        mm7,    mm7
+
+        ; zero the 16 input coefficients now that they are held in registers
+
+        movq        [rax],   mm7
+        movq        [rax+8], mm7
+
+        movq        [rax+16],mm7
+        movq        [rax+24],mm7
+
+
+        movsxd      rax,            dword ptr arg(3) ;pitch
+
+        ; first pass: butterfly across the four rows (mm0..mm3), four columns in parallel
+
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ;0
+
+        paddw       mm4,            mm7             ;1
+        psubw       mm0,            mm7             ;2
+
+        psubw       mm6,            mm3             ;3
+
+        ; transpose the 4x4 block of words so the second pass works on the other dimension
+
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        movq        mm3,            mm5             ; 33 23 13 03
+
+        ; second pass: same butterfly, plus rounding (+4) and >>3
+
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        paddw       mm0,            [fours GLOBAL]  ; b1 += 4: a1/b1 feed every output, so all outputs get the rounding term
+
+        paddw       mm2,            [fours GLOBAL]  ; a1 += 4
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ;0
+
+        paddw       mm4,            mm7             ;1
+        psubw       mm0,            mm7             ;2
+
+        psubw       mm6,            mm3             ;3
+        psraw       mm2,            3
+
+        psraw       mm0,            3
+        psraw       mm4,            3
+
+        psraw       mm6,            3
+
+        ; transpose back before storing
+
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        ; store the four output rows, pitch bytes apart
+
+        movq        [rdx],          mm0
+
+        movq        [rdx+rax],      mm1
+        movq        [rdx+rax*2],    mm2
+
+        add         rdx,            rax
+        movq        [rdx+rax*2],    mm5             ; = output + 3*pitch
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_dequant_dc_idct_mmx(short *input, short *dq, short *output, int pitch, int Dc)
+;
+; Same as vp8_dequant_idct_mmx, except that after dequantization the first
+; coefficient (word 0 of mm0) is replaced by the low 16 bits of Dc before the
+; transform. The 32 bytes at input are cleared, and four 8-byte rows are
+; stored at output, output+pitch, output+2*pitch and output+3*pitch (pitch is
+; a byte offset).
+;
+; rsi/rdi are pushed and popped although this routine does not use them.
+; GET_GOT/RESTORE_GOT come from x86_abi_support.asm; presumably they set up
+; rbx for the "GLOBAL" constant references in PIC builds -- confirm there.
+global sym(vp8_dequant_dc_idct_mmx)
+sym(vp8_dequant_dc_idct_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,    arg(0) ;input
+        mov         rdx,    arg(1) ;dq
+
+        movsxd      rcx,    dword ptr arg(4) ;Dc
+
+        ; dequantize: mm0..mm3 = the four rows of input[] * dq[]
+
+        movq        mm0,    [rax   ]
+        pmullw      mm0,    [rdx]
+
+        movq        mm1,    [rax +8]
+        pmullw      mm1,    [rdx +8]
+
+        movq        mm2,    [rax+16]
+        pmullw      mm2,    [rdx+16]
+
+        movq        mm3,    [rax+24]
+        pmullw      mm3,    [rdx+24]
+
+        mov         rdx,    arg(2) ;output
+        pxor        mm7,    mm7
+
+        ; zero the 16 input coefficients now that they are held in registers
+
+        movq        [rax],   mm7
+        movq        [rax+8], mm7
+
+        movq        [rax+16],mm7
+        movq        [rax+24],mm7
+
+        pinsrw      mm0,    rcx,  0                 ; overwrite coefficient 0 with Dc
+        movsxd      rax,            dword ptr arg(3) ;pitch
+
+        ; first pass: butterfly across the four rows (mm0..mm3), four columns in parallel
+
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ;0
+
+        paddw       mm4,            mm7             ;1
+        psubw       mm0,            mm7             ;2
+
+        psubw       mm6,            mm3             ;3
+
+        ; transpose the 4x4 block of words so the second pass works on the other dimension
+
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        movq        mm3,            mm5             ; 33 23 13 03
+
+        ; second pass: same butterfly, plus rounding (+4) and >>3
+
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [x_s1sqr2 GLOBAL];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [x_c1sqr2less1 GLOBAL];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [x_c1sqr2less1 GLOBAL]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [x_s1sqr2 GLOBAL]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        paddw       mm0,            [fours GLOBAL]  ; b1 += 4: a1/b1 feed every output, so all outputs get the rounding term
+
+        paddw       mm2,            [fours GLOBAL]  ; a1 += 4
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ;0
+
+        paddw       mm4,            mm7             ;1
+        psubw       mm0,            mm7             ;2
+
+        psubw       mm6,            mm3             ;3
+        psraw       mm2,            3
+
+        psraw       mm0,            3
+        psraw       mm4,            3
+
+        psraw       mm6,            3
+
+        ; transpose back before storing
+
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        ; store the four output rows, pitch bytes apart
+
+        movq        [rdx],          mm0
+
+        movq        [rdx+rax],      mm1
+        movq        [rdx+rax*2],    mm2
+
+        add         rdx,            rax
+        movq        [rdx+rax*2],    mm5             ; = output + 3*pitch
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+; Constants for the IDCT routines above, each replicated into four words.
+; The multipliers are 16-bit fixed-point fractions used with pmulhw (signed
+; multiply, high 16 bits); every pmulhw above is followed by a paddw that adds
+; the original operand back:
+;   x_s1sqr2      0x8A8C = 35468 ~= 65536 * sqrt(2) * sin(pi/8). As a signed
+;                 word this is 35468 - 65536, so pmulhw yields x*c - x and the
+;                 paddw restores x*c.
+;   x_c1sqr2less1 0x4E7B = 20091 ~= 65536 * (sqrt(2) * cos(pi/8) - 1); the
+;                 paddw supplies the "+ x" to give x * sqrt(2) * cos(pi/8).
+;   fours         rounding term added before the final arithmetic shift right by 3.
+SECTION_RODATA
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B
+align 16
+fours:
+    times 4 dw 0x0004
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h
new file mode 100644
index 0000000..5def406
--- /dev/null
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_X86_H
+#define DEQUANTIZE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings in this file, make the same change in the
+ * runtime setup, vp8_arch_x86_decode_init() in x86_dsystemdependent.c.
+ */
+#if HAVE_MMX
+extern prototype_dequant_block(vp8_dequantize_b_mmx);      /* C wrapper in x86_dsystemdependent.c */
+extern prototype_dequant_idct(vp8_dequant_idct_mmx);
+extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT  /* static build: bind the generic names straight to the MMX versions */
+#undef  vp8_dequant_block
+#define vp8_dequant_block vp8_dequantize_b_mmx
+
+#undef  vp8_dequant_idct
+#define vp8_dequant_idct vp8_dequant_idct_mmx
+
+#undef  vp8_dequant_idct_dc
+#define vp8_dequant_idct_dc vp8_dequant_dc_idct_mmx
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif /* HAVE_MMX */
+
+#endif /* DEQUANTIZE_X86_H */
diff --git a/vp8/decoder/x86/onyxdxv.c b/vp8/decoder/x86/onyxdxv.c
new file mode 100644
index 0000000..75a676a
--- /dev/null
+++ b/vp8/decoder/x86/onyxdxv.c
@@ -0,0 +1,1079 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     onyxdxv.c
+*
+*   Description  :     VP80 interface to DXV.
+*
+*****************************************************************************
+*/
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include <math.h>   // For Abs()
+#include "pragmas.h"
+
+#include "vpxdxv.h"
+#include "vpxdxv_plugin.h"
+
+#include "onyxd_int.h"
+#include "onyx.h"
+#include "codec_common_interface.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_mem/vpx_mem.h"
+#include "postproc.h"
+#include "vpxblit.h"
+#include "g_common.h"
+#include "vpx_scale/yv12extend.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include "scale_mode.h"
+#include "onyx_pb_interface.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+
+#define VP8_FOURCC DXL_MKFOURCC( 'V', 'P', '8', '0')
+
+extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
+
+
+/****************************************************************************
+*  Typedefs
+****************************************************************************/
+
+typedef struct  // YUV buffer configuration structure; onyx_blit() hands it to the blitter cast to YUV_BUFFER_CONFIG *
+{
+    int   y_width;       // luma size: copied by convert_yv12_buffer_types(), then clipped to the view in onyx_blit()
+    int   y_height;
+    int   y_stride;      // negated by onyx_blit() before the blitter is called
+
+    int   uv_width;      // chroma size: set to half the luma size (rounded up when clipped) in onyx_blit()
+    int   uv_height;
+    int   uv_stride;     // negated together with y_stride
+
+    char *y_buffer;      // plane pointers; they alias the source YV12_BUFFER_CONFIG planes, no pixels are copied
+    char *u_buffer;
+    char *v_buffer;
+
+    char *uv_start;      // next three are only written for VPXDXV_YV12 / VPXDXV_I420 output in onyx_blit()
+    int   uv_dst_area;
+    int   uv_used_area;
+
+    unsigned char *y_ptr_scrn;   // NOTE(review): the *_ptr_scrn fields are not touched by the code shown in this file section
+    unsigned char *u_ptr_scrn;
+    unsigned char *v_ptr_scrn;
+
+
+} DXV_YUV_BUFFER_CONFIG;
+
+
+typedef void ((*vp8blit_func)(unsigned char *, int, YUV_BUFFER_CONFIG *));  // called as (screen pointer, pitch, frame descriptor)
+
+/* per-ximage decoder state; allocated in vp8_ximagecreate() and freed in vp8_ximagedestroy() */
+typedef struct t_ximage_codec
+{
+    DXV_YUV_BUFFER_CONFIG frame_buffer;   // descriptor filled in by onyx_blit() and passed to the blitter
+    VP8D_COMP *my_pbi;                    // decoder created by vp8_ximagestart_decode()
+    VP8_COMMON *common;                   // &my_pbi->common, or a pointer supplied through PBC_SET_COMMON
+    int owned;                            // set when start_decode created my_pbi; cleared by stop_decode
+    int decompressed_once;                // NOTE(review): no use in the code shown here
+
+    int sizeof_pixel;                     // NOTE(review): no use in the code shown here
+    vp8blit_func blitter;                 // chosen by onyx_set_output_format()
+
+    unsigned int ppl_tag;                 // value stored/returned by onyx_set_post_proc()/onyx_get_post_proc()
+    unsigned int bd_tag;                  // output format tag accepted by onyx_set_output_format()
+    unsigned int *supported_output_format_list;   // NOTE(review): no use in the code shown here
+
+    int cpu_free;                         // PBC_SET_CPUFREE: percentage of the frame time onyx_blit() may spend; 0 = no limit
+    int postproc;                         // PBC_SET_POSTPROC: packed level / overlay / noise value decoded in onyx_blit()
+    int add_noise;                        // PBC_SET_ADDNOISE: 1 makes onyx_blit() call vp8_plane_add_noise()
+    int deinterlace;                      // PBC_SET_DEINTERLACEMODE
+
+    int post_proc2time;                   // microseconds charged to vp8_deblock()
+    int post_proc4time;                   // microseconds charged to vp8_deblock_and_de_macro_block()
+
+    int hs;                               // scale ratios compared against the current ones in onyx_blit()
+    int hr;                               // NOTE(review): none of hs/hr/vs/vr is assigned in the code shown here
+    int vs;
+    int vr;
+    YV12_BUFFER_CONFIG this_buffer;       // descriptor of the frame being shown; also handed to vp8d_decompress_frame()
+    YV12_BUFFER_CONFIG scaled_buffer;     // destination of vp8_yv12_scale_or_center()
+    YV12_BUFFER_CONFIG *passed_in_buffer; // PBC_SET_BLITBUFF: used for one blit, then reset to 0
+
+    int avgq;                             // running filter-level figure maintained by onyx_decompress()
+    int ppcount;                          // blit countdown; at 0 the two timing estimates are cleared and it restarts at 64
+
+
+} VP8_XIMAGE, *VP8_XIMAGE_HANDLE;
+
+
+/****************************************************************************
+*  Module Statics
+****************************************************************************/
+static unsigned int g_vp8_preferred_output_format_list[] =   // index n is bit n of the "supported" mask built in onyx_get_output_list()
+{
+    VPXDXV_YUY2,
+    VPXDXV_UYVY,
+    VPXDXV_RGB8888,
+    VPXDXV_RGB888,
+    VPXDXV_RGB555,
+    VPXDXV_RGB565,
+    VPXDXV_YV12,
+    VPXDXV_I420,
+
+//    VPXDXV_YV12,
+//    VPXDXV_YUY2,
+//    VPXDXV_RGB565,
+//    VPXDXV_UYVY,
+    0   // terminator: onyx_set_output_format() scans until it reaches this entry
+};
+
+/****************************************************************************
+*  Forward declarations
+****************************************************************************/
+void onyx_set_parameter(XIMAGE_HANDLE src, int Command, unsigned int Parameter);
+
+static int onyx_get_output_format(XIMAGE_HANDLE src, unsigned int *bd_tag);
+static int onyx_set_output_format(XIMAGE_HANDLE src, unsigned int bd_tag);
+
+static int vpx_get_size_of_pixel(unsigned int bd);
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+
+#define __Clamp255(x)   (unsigned char) ( (x) < 0 ? 0 : ( (x) <= 255 ? (x) : 255 ) )
+
+/*
+ * convert_yv12_buffer_types
+ *
+ * Fills the plane pointers and the width/height/stride of both the luma and
+ * chroma planes of a DXV_YUV_BUFFER_CONFIG from a YV12_BUFFER_CONFIG. Only
+ * the descriptor is copied: dest ends up aliasing source's pixel planes, and
+ * dest's remaining fields (uv_start, uv_*_area, *_ptr_scrn) are not touched.
+ */
+void
+convert_yv12_buffer_types(YV12_BUFFER_CONFIG *source, DXV_YUV_BUFFER_CONFIG *dest)
+{
+    dest->y_width   = source->y_width;
+    dest->y_height  = source->y_height;
+    dest->y_stride  = source->y_stride;
+    dest->y_buffer  = (char *)source->y_buffer;
+    dest->uv_width  = source->uv_width;
+    dest->uv_height = source->uv_height;
+    dest->uv_stride = source->uv_stride;
+    dest->u_buffer  = (char *)source->u_buffer;
+    dest->v_buffer  = (char *)source->v_buffer;
+}
+
+/*
+ * onyx_blit: optionally post-processes and scales the frame to show, clips and
+ * centres it in the v_screen view, then calls the blitter. Returns DXV_OK.
+ */
+
+
+int onyx_blit
+(
+    XIMAGE_HANDLE src,
+    VSCREEN_HANDLE v_screen,
+    DXV_YUV_BUFFER_CONFIG *frame_buffer,
+    int x,
+    int y
+)
+{
+    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+    VP8D_COMP *pbi;     /* may be 0: every use below has to be guarded */
+    VP8_COMMON *common = tab->common;
+    pbi = tab->my_pbi;
+
+    if (v_screen) /* if there is a v_screen, blit to it */
+    {
+        unsigned char *ptr_scrn;
+        int this_pitch, vs_height, vs_width;
+        unsigned int start_tick, stop_tick;
+
+        vpxdxv_get_vscreen_attributes(v_screen, (void **)&ptr_scrn,  &vs_width, &vs_height, &this_pitch);
+
+        if (ptr_scrn)
+        {
+            int w, h;
+
+            int p_size;
+            int view_x, view_y, view_w;
+            int hs, hr, vs, vr;
+            int neww, newh;
+            int cw, ch;
+            int microseconds_available = (int)(1000000 / 30);   /* one frame period at 30 frames per second */
+
+            microseconds_available = microseconds_available * tab->cpu_free / 100;  /* cpu_free is a percentage */
+
+            if (pbi)
+            {
+                microseconds_available -= pbi->decode_microseconds;
+
+                if (tab->cpu_free == 0)     /* 0 switches the time budget off */
+                    microseconds_available = INT_MAX;
+
+                if (tab->post_proc2time == 0)   /* no measurement yet: guess from the decode time */
+                    tab->post_proc2time = pbi->decode_microseconds * 1 / 2;
+
+                if (tab->post_proc4time == 0)
+                    tab->post_proc4time = pbi->decode_microseconds;
+            }
+
+
+            if (tab->ppcount == 0)  /* countdown expired: drop both estimates so they are taken afresh */
+            {
+                tab->post_proc2time = 0;
+                tab->post_proc4time = 0;
+                tab->ppcount = 64;
+            }
+            else
+            {
+                tab->ppcount --;
+            }
+
+            vpxdxv_get_vscreen_view(v_screen, &view_x, &view_y, &view_w, NULL);
+
+            Scale2Ratio(common->horiz_scale, &hr, &hs);
+            Scale2Ratio(common->vert_scale, &vr, &vs);
+
+            if (tab->postproc && tab->passed_in_buffer == 0)
+            {
+                int show_text = 0;
+
+                char message[512];  /* plain char: it is passed to sprintf() and vp8_blit_text() */
+
+                int pp = tab->postproc;
+                int q = (tab->avgq + 4) / 8;
+                int noise = 0;
+
+                vp8_clear_system_state();
+
+                if (pp >= 1000)     /* 1000 + 100 * noise + level */
+                {
+                    pp -= 1000;
+                    noise = pp / 100;
+                    pp = pp - noise * 100;
+                }
+
+                if (pp >= 300)      /* otherwise the hundreds digit picks a debug overlay */
+                {
+                    pp -= 300;
+                    show_text = 3;
+                }
+                else if (pp >= 200)
+                {
+                    pp -= 200;
+                    show_text = 2;
+                }
+                else if (pp >= 100)
+                {
+                    pp -= 100;
+                    show_text = 1;
+                }
+
+                if (pbi && (pbi->mb.segmentation_enabled & SEGMENT_PF) && tab->deinterlace)
+                {
+                    de_interlace(common->frame_to_show->y_buffer, common->post_proc_buffer.y_buffer,
+                                 common->post_proc_buffer.y_width, common->post_proc_buffer.y_height,
+                                 common->post_proc_buffer.y_stride);
+
+                    de_interlace(common->frame_to_show->u_buffer, common->post_proc_buffer.u_buffer,
+                                 common->post_proc_buffer.uv_width, common->post_proc_buffer.uv_height,
+                                 common->post_proc_buffer.uv_stride);
+                    de_interlace(common->frame_to_show->v_buffer, common->post_proc_buffer.v_buffer,
+                                 common->post_proc_buffer.uv_width, common->post_proc_buffer.uv_height,
+                                 common->post_proc_buffer.uv_stride);
+                }
+                else
+                {
+                    if (pp >= 10 && pp <= 20)   /* levels 10..20 shift q by (pp - 15) * 10 */
+                    {
+                        q = q + (pp - 15) * 10;
+
+                        if (q < 0)
+                            q = 0;
+                    }
+
+                    start_tick = vp8_get_high_res_timer_tick();
+
+                    if (pp > 3 && tab->post_proc4time < microseconds_available)
+                    {
+                        vp8_deblock_and_de_macro_block(common->frame_to_show, &common->post_proc_buffer, q, 1, 0);
+
+                        stop_tick = vp8_get_high_res_timer_tick();
+
+                        if (pbi)
+                            tab->post_proc4time = vp8_get_time_in_micro_sec(start_tick, stop_tick);
+                    }
+
+                    else if (pp > 0 && tab->post_proc2time < microseconds_available)
+                    {
+                        vp8_deblock(common->frame_to_show, &common->post_proc_buffer, q , 1,  0);
+                        stop_tick = vp8_get_high_res_timer_tick();
+
+                        if (pbi)
+                            tab->post_proc2time = vp8_get_time_in_micro_sec(start_tick, stop_tick);
+                    }
+                    else    /* no filtering requested, or not enough time left for it */
+                    {
+                        vp8_yv12_copy_frame(common->frame_to_show, &common->post_proc_buffer);
+                    }
+
+                }
+
+                vp8_clear_system_state();
+
+                if (tab->add_noise == 1)
+                {
+
+                    vp8_plane_add_noise(common->post_proc_buffer.y_buffer,
+                                        common->post_proc_buffer.y_width, common->post_proc_buffer.y_height,
+                                        common->post_proc_buffer.y_stride, 63 - q, noise);
+                }
+
+
+                if (show_text == 1)
+                {
+#ifdef PACKET_TESTING
+                    {
+                        VP8_HEADER *oh2 = (VP8_HEADER *) pbi->Source;
+                        sprintf(message, "%8d %d%d%d%d%d size:%d\n",
+                        oh2->frame_number ,
+                        oh2->update_gold  ,
+                        oh2->update_last  ,
+                        oh2->uses_gold    ,
+                        oh2->uses_last    ,
+                        oh2->type,
+                        vpxdxv_get_ximage_csize(src));
+                    }
+#else
+                    sprintf(message, "F:%1ldG:%1ldQ:%3ldF:%3ld,%3ldP:%d_s:%6ld,N:%d,",
+                            (long)(common->frame_type == KEY_FRAME),    /* every %ld needs a long argument */
+                            (long)common->refresh_golden_frame,
+                            (long)common->base_qindex,
+                            (long)common->filter_level,
+                            (long)q,
+                            tab->postproc,
+                            (long)vpxdxv_get_ximage_csize(src), noise);
+#endif
+
+                    vp8_blit_text(message, common->post_proc_buffer.y_buffer, common->post_proc_buffer.y_stride);
+
+                }
+                else if (show_text == 2)
+                {
+                    int i, j;
+                    unsigned char *y_ptr;
+                    YV12_BUFFER_CONFIG *post = &common->post_proc_buffer;
+                    int mb_rows = post->y_height >> 4;
+                    int mb_cols = post->y_width  >> 4;
+                    int mb_index = 0;
+                    MODE_INFO *mi = common->mi;
+
+                    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+                    // one letter per macroblock: its mode (pp == 4) or its reference frame
+                    for (i = 0; i < mb_rows; i++)
+                    {
+                        for (j = 0; j < mb_cols; j++)
+                        {
+                            char zz[4];
+
+                            if (pp == 4)
+                                sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+                            else
+                                sprintf(zz, "%c", mi[mb_index].mbmi.ref_frame + 'a');
+
+                            vp8_blit_text(zz, y_ptr, post->y_stride);
+                            mb_index ++;
+                            y_ptr += 16;
+                        }
+
+                        mb_index ++; //border
+                        y_ptr += post->y_stride  * 16 - post->y_width;
+
+                    }
+                }
+                else if (show_text == 3)
+                {
+                    int i, j;
+                    unsigned char *y_ptr;
+                    YV12_BUFFER_CONFIG *post = &common->post_proc_buffer;
+                    int mb_rows = post->y_height >> 4;
+                    int mb_cols = post->y_width  >> 4;
+                    int mb_index = 0;
+                    MODE_INFO *mi = common->mi;
+
+                    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+                    // one digit per macroblock: the row number in column 0, the column number elsewhere
+                    for (i = 0; i < mb_rows; i++)
+                    {
+                        for (j = 0; j < mb_cols; j++)
+                        {
+                            char zz[4];
+
+                            if (j == 0)
+                                sprintf(zz, "%c", '0' + i % 10);
+                            else
+                                sprintf(zz, "%c", '0' + j % 10);
+
+                            vp8_blit_text(zz, y_ptr, post->y_stride);
+                            mb_index ++;
+                            y_ptr += 16;
+                        }
+
+                        y_ptr += post->y_stride  * 16 - post->y_width;
+
+                    }
+                }
+
+                vpx_memcpy(&tab->this_buffer, &common->post_proc_buffer, sizeof(YV12_BUFFER_CONFIG));
+            }
+            else
+            {
+                vpx_memcpy(&tab->this_buffer, common->frame_to_show, sizeof(YV12_BUFFER_CONFIG));
+            }
+
+
+            /* get a frame pointer to the scaled and postprocessed reconstructed buffer */
+            if (tab->passed_in_buffer == 0)
+            {
+                if (common->horiz_scale != NORMAL || common->vert_scale != NORMAL)
+                {
+                    neww = hs * tab->this_buffer.y_width / hr;
+                    newh = vs * tab->this_buffer.y_height / vr;
+
+                    neww += neww & 1;   /* round the width up to an even number */
+
+                    /* NOTE(review): tab->hs/hr/vs/vr are not assigned in the code shown here, so this looks always true */
+                    if (tab->hs != hs || tab->hr != hr || tab->vs != vs || tab->vr != vr)
+                    {
+                        vp8_yv12_alloc_frame_buffer(&tab->scaled_buffer, neww, newh , 8);
+                    }
+
+                    vp8_yv12_scale_or_center(&tab->this_buffer,
+                                             &tab->scaled_buffer,
+                                             neww, newh, SCALE_TO_FIT, hs, hr, vs, vr);
+
+                    convert_yv12_buffer_types(&tab->scaled_buffer, frame_buffer);
+
+                    cw = hs * common->Width / hr;
+                    ch = vs * common->Height / vr;
+
+                }
+                else
+                {
+                    convert_yv12_buffer_types(&tab->this_buffer, frame_buffer);
+
+                    cw = common->Width;
+                    ch = common->Height;
+                }
+            }
+            else
+            {
+                convert_yv12_buffer_types(tab->passed_in_buffer, frame_buffer);
+                cw = common->Width;
+                ch = common->Height;
+                tab->passed_in_buffer = 0;  /* a passed-in buffer is used for one blit only */
+            }
+
+            frame_buffer->y_width = cw;
+            frame_buffer->y_height = ch;
+            frame_buffer->uv_width = cw / 2;
+            frame_buffer->uv_height = ch / 2;
+
+            p_size = vpx_get_size_of_pixel(tab->bd_tag);
+
+            /* remember to offset if requested */
+            y += view_y;
+            x += view_x ;
+
+            /* for planar destinations */
+            w = view_w;
+            h = vs_height;
+
+            if (w < frame_buffer->y_width)      /* clip to the view width */
+            {
+                frame_buffer->y_width = w;
+                frame_buffer->uv_width = (w + 1) / 2;
+            }
+
+            if (h < frame_buffer->y_height)     /* clip to the screen height */
+            {
+                frame_buffer->y_height = h;
+                frame_buffer->uv_height = (h + 1) / 2;
+            }
+
+            if (frame_buffer->y_width < view_w)     /* centre horizontally */
+                x += (view_w - frame_buffer->y_width) / 2;
+
+            if (x & 1)      /* keep x even */
+                x -= 1;
+
+            if (frame_buffer->y_height < vs_height) /* centre vertically */
+                y += (vs_height - frame_buffer->y_height) / 2;
+
+
+            ptr_scrn += (x * p_size) + (y * this_pitch);
+
+            frame_buffer->y_stride *= -1;
+            frame_buffer->uv_stride *= -1;
+
+            if (tab->bd_tag == VPXDXV_YV12 || tab->bd_tag == VPXDXV_I420)
+            {
+                if (this_pitch < 0)
+                {
+                    frame_buffer->uv_start = (char *)(ptr_scrn + abs(this_pitch) + abs(this_pitch) * h / 4 + this_pitch / 2);
+                    frame_buffer->uv_dst_area = abs((this_pitch * h) / 4);
+                    frame_buffer->uv_used_area = 0;
+                }
+                else
+                {
+                    frame_buffer->uv_start = (char *)(ptr_scrn + (this_pitch * h));
+                    frame_buffer->uv_dst_area = (((this_pitch + 1) / 2) * ((h + 1) / 2));
+                    frame_buffer->uv_used_area = (((this_pitch + 1) / 2) * frame_buffer->uv_height);
+                }
+            }
+
+            if (pbi && (pbi->mb.segmentation_enabled & SEGMENT_PF) && (tab->bd_tag != VPXDXV_YV12 && tab->bd_tag != VPXDXV_I420))
+            {
+                int ypitch = frame_buffer->y_stride;
+                int uvpitch = frame_buffer->uv_stride;
+
+                frame_buffer->y_stride <<= 1;   /* two passes over every other line: double the pitches, halve the heights */
+                frame_buffer->y_height >>= 1;
+                frame_buffer->uv_stride <<= 1;
+                frame_buffer->uv_height >>= 1;
+
+                ptr_scrn += this_pitch;
+                frame_buffer->y_buffer -= ypitch;
+                frame_buffer->u_buffer -= uvpitch;
+                frame_buffer->v_buffer -= uvpitch;
+                tab->blitter(ptr_scrn, 2 * this_pitch, (YUV_BUFFER_CONFIG *)(&tab->frame_buffer));
+
+                ptr_scrn -= this_pitch;
+                frame_buffer->y_buffer += ypitch;
+                frame_buffer->u_buffer += uvpitch;
+                frame_buffer->v_buffer += uvpitch;
+                tab->blitter(ptr_scrn, 2 * this_pitch, (YUV_BUFFER_CONFIG *)(&tab->frame_buffer));
+
+            }
+            else
+            {
+                /* blit the screen */
+                tab->blitter(ptr_scrn, this_pitch, (YUV_BUFFER_CONFIG *)(&tab->frame_buffer));
+                vpx_log("Decoder: Frame shown \n");
+            }
+
+        }
+        else
+            vpx_log("Decoder: Frame not shown scrn pointer 0\n");
+    }
+    else
+        vpx_log("Decoder: Frame not shown vscreen 0\n");
+
+    return DXV_OK;
+}
+/****************************************************************************
+ *
+ *  ROUTINE       :     onyx_decompress
+ *
+ *  INPUTS        :     src      : ximage whose compressed data is decoded
+ *                      v_screen : destination screen handed to onyx_blit
+ *
+ *  OUTPUTS       :     None
+ *
+ *  RETURNS       :     DXV_VERSION_CONFLICT / DXV_BAD_DATA when the decoder
+ *                      reports an error, otherwise onyx_blit's result.
+ *
+ *  FUNCTION      :     Decodes the pending frame (if any), updates the
+ *                      running filter-level figure, then blits.
+ ****************************************************************************/
+static
+int onyx_decompress(XIMAGE_HANDLE src, VSCREEN_HANDLE v_screen)
+{
+    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+    unsigned char *data = vpxdxv_get_ximage_cdata_addr(src);
+    unsigned int data_size = vpxdxv_get_ximage_csize(src);
+    int left, top;
+    int width, height;      /* fetched together with the position, not used here */
+
+    vpxdxv_get_ximage_xywh(src, &left, &top, &width, &height);
+
+    /* Decode only when compressed data is attached. Without it we fall
+     * through and simply redo the scaling and postprocessing of the frame
+     * that was decoded last.
+     */
+    if (data && data_size != 0)
+    {
+        int decode_flags;
+        int level;
+        int status;
+
+        status = vp8d_decompress_frame((VP8D_PTR) tab->my_pbi,
+                                       data_size,
+                                       (char *) data,
+                                       &tab->this_buffer,
+                                       &decode_flags);
+
+        /* The loop filter level, rescaled by 10/6, is the per-frame sample.
+         * It is folded into avgq before the decode status is examined, so
+         * avgq changes even for a frame that is about to be rejected.
+         */
+        level = tab->my_pbi->common.filter_level * 10 / 6;
+
+        if (tab->my_pbi->common.frame_type != KEY_FRAME)
+        {
+            /* decay the running value by 7/8 and add the new sample */
+            tab->avgq = tab->avgq * 7 / 8 + level;
+        }
+        else
+        {
+            /* key frame: restart the running value at eight samples' worth */
+            tab->avgq = 8 * level;
+        }
+
+        /* -1 is reported as a version conflict, any other non-zero status
+         * as bad data.
+         */
+        if (status == -1)
+            return DXV_VERSION_CONFLICT;
+
+        if (status != 0)
+            return DXV_BAD_DATA;
+
+    }
+
+    /* show the result at the ximage's own x/y position */
+    return onyx_blit(src, v_screen, &tab->frame_buffer, left, top);
+}
+/*
+ * Destroy callback: releases the scaled buffer, the decoder (only while this
+ * wrapper still owns it) and finally the algorithm base. Returns DXV_OK.
+ */
+static
+int vp8_ximagedestroy(XIMAGE_HANDLE src)
+{
+    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+
+    if (!tab)
+        return DXV_OK;
+
+    vp8_yv12_de_alloc_frame_buffer(&tab->scaled_buffer);
+
+    /* stop_decode clears owned; if it is still set the decoder was never
+     * removed and has to go here */
+    if (tab->owned)
+        vp8dx_remove_decompressor((VP8D_PTR)(tab->my_pbi));
+
+    duck_free(tab);
+
+    return DXV_OK;
+}
+/*
+ * Post-processing level getter: stores the saved ppl_tag in *ppl.
+ * Returns DXV_NULL_BASE when src has no algorithm base, DXV_OK otherwise.
+ */
+static int
+onyx_get_post_proc(XIMAGE_HANDLE src, unsigned int *ppl)
+{
+    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+
+    if (!tab)
+    {
+        return DXV_NULL_BASE;
+    }
+
+    *ppl = tab->ppl_tag;
+
+    return DXV_OK;
+}
+/*
+ * Post-processing level setter: remembers ppl in ppl_tag.
+ * Returns DXV_NULL_BASE when src has no algorithm base, DXV_OK otherwise.
+ */
+static int
+onyx_set_post_proc(XIMAGE_HANDLE src, unsigned int ppl)
+{
+    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+
+    if (!tab)
+    {
+        return DXV_NULL_BASE;
+    }
+
+    tab->ppl_tag = ppl;
+
+    return DXV_OK;
+}
+/* Stop-decode callback: frees the scaled buffer and, when this wrapper owns the
+ * decoder, removes it and forgets the pointers into it. Always returns DXV_OK. */
+static
+int vp8_ximagestop_decode(XIMAGE_HANDLE src)
+{
+    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+
+    if (this_algorithm_base)
+    {
+        vp8_yv12_de_alloc_frame_buffer(&this_algorithm_base->scaled_buffer);
+
+        if (this_algorithm_base->owned)
+        {
+            /* a stale common would keep the next start_decode from creating a new decoder */
+            if (this_algorithm_base->my_pbi && this_algorithm_base->common == &this_algorithm_base->my_pbi->common)
+                this_algorithm_base->common = 0;
+            vp8dx_remove_decompressor((VP8D_PTR)(this_algorithm_base->my_pbi));
+            this_algorithm_base->my_pbi = 0;
+        }
+        this_algorithm_base->owned = 0;
+    }
+    return DXV_OK;
+}
+
+/* Start-decode callback: creates a decoder (unless a VP8_COMMON is already
+ * attached) from the ximage dimensions and resets the per-stream blit state. */
+static
+int vp8_ximagestart_decode
+(
+    XIMAGE_HANDLE src
+)
+{
+    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+    XIMAGE_INFO_PTR xinfo = vpxdxv_get_ximage_info(src);
+    VP8D_CONFIG ocf = {0};  /* zeroed so the decoder never sees indeterminate fields */
+
+    if (this_algorithm_base == 0)
+        return DXV_NULL_BASE;
+
+    if (xinfo)
+    {
+        ocf.Width = xinfo->width;
+        ocf.Height = xinfo->height;
+    }
+
+    if (this_algorithm_base->common == 0)
+    {
+        this_algorithm_base->my_pbi = (VP8D_COMP *) vp8dx_create_decompressor(&ocf);
+        this_algorithm_base->owned = 1;     /* NOTE(review): the create result is not checked */
+        this_algorithm_base->common = &this_algorithm_base->my_pbi->common;
+        this_algorithm_base->avgq = 0;
+    }
+
+    this_algorithm_base->passed_in_buffer = 0;
+    this_algorithm_base->post_proc2time = 0;
+    this_algorithm_base->post_proc4time = 0;
+    this_algorithm_base->ppcount = 64;
+
+    return DXV_OK;
+}
+/*
+ * Create callback: allocates the VP8 algorithm base for src and registers
+ * every VP8 entry point on it. Returns the base, or NULL if allocation fails.
+ */
+static
+DXV_HANDLE vp8_ximagecreate(XIMAGE_HANDLE src)
+{
+    /* one container per ximage, allocated through duck_calloc */
+    VP8_XIMAGE_HANDLE tab =
+        (VP8_XIMAGE_HANDLE)duck_calloc(1, sizeof(VP8_XIMAGE), DMEM_GENERAL);
+
+    if (!tab)
+    {
+        return NULL;
+    }
+
+    vp8_scale_machine_specific_config();
+
+    /* decode life cycle */
+    vpxdxv_register_ximage_start_decode(src, vp8_ximagestart_decode);
+    vpxdxv_register_ximage_stop_decode(src, vp8_ximagestop_decode);
+    vpxdxv_register_ximage_destroy(src, vp8_ximagedestroy);
+
+    /* per-frame decode and run-time parameters */
+    vpxdxv_register_ximage_dx(src, onyx_decompress);
+    vpxdxv_register_ximage_set_parameter(src, onyx_set_parameter);
+
+    /* output format: getter first, setter second */
+    vpxdxv_register_ximage_output_format_func(src, onyx_get_output_format,
+                                              onyx_set_output_format);
+
+    /* post-processing level: getter first, setter second */
+    vpxdxv_register_ximage_post_proc_level_func(src, onyx_get_post_proc,
+                                                onyx_set_post_proc);
+
+    return (DXV_HANDLE)tab;
+}
+
+/*
+ * store_output_list
+ *
+ * Writes to outlist the preferred formats whose bit is set in `supported`
+ * (bit n <-> g_vp8_preferred_output_format_list[n]), lowest bit first, then
+ * a terminating 0. outlist must have room for count + 1 entries.
+ */
+
+static int store_output_list(unsigned int supported, int count,
+                             unsigned int *outlist)
+{
+    int stored = 0, index = 0;
+
+    /* stop early if the mask runs out of bits before `count` is reached */
+    while (stored < count && supported)
+    {
+        if (supported & 0x01)
+            outlist[stored++] = g_vp8_preferred_output_format_list[index];
+
+        supported >>= 1;
+        ++index;
+    }
+
+    /* the caller sizes the buffer as count + 1 entries; fill the last one */
+    outlist[stored] = 0;
+
+    return DXV_OK;
+}
+
+/* Output-list callback. outlist == NULL: *size receives the bytes required,
+ * (supported formats + 1) ints. Otherwise *size must equal that figure and
+ * outlist is filled by store_output_list(); on a mismatch outlist[0] is set to
+ * 0. Returns DXV_OK on success, DXV_INVALID_REQUEST otherwise (or if !size). */
+static int onyx_get_output_list(XIMAGE_INFO_PTR xinfo, unsigned int *outlist,
+                                unsigned int *size)
+{
+    int ret = DXV_INVALID_REQUEST;
+    unsigned int i,                 /* unsigned: compared with a sizeof expression and used as a shift count */
+                 supported = 0,     /* bit n set <=> preferred format n has a blitter */
+                 count = 0;
+    (void)xinfo;
+
+    if (size)
+    {
+        for (i = 0; i < sizeof(g_vp8_preferred_output_format_list) / sizeof(unsigned int) && i < 32; ++i)
+        {
+            if (vpx_get_blitter(g_vp8_preferred_output_format_list[i]) != (void *)0xffffffff)
+            {
+                supported |= (1u << i);     /* 1u: shifting a signed 1 into bit 31 is undefined */
+                ++count;
+            }
+        }
+
+        if (outlist)
+        {
+            if (count && ((count + 1) == (*size / sizeof(int))))
+                ret = store_output_list(supported, count, outlist);
+            else
+                *outlist = 0;
+        }
+        else
+        {
+            *size = (count + 1) * sizeof(int);
+            ret = DXV_OK;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * onyx_init
+ *
+ * Selects the VPX blitters for this CPU and registers the VP8 ximage
+ * factory and output-list callback under VP8_FOURCC.
+ *
+ * Returns whatever vpxdxv_register_ximage() reports.
+ */
+int onyx_init(void)
+{
+    /* register VPX blitters based on cpu */
+    vpx_set_blit();
+
+    /* hand the registration status straight back to the caller */
+    return vpxdxv_register_ximage(vp8_ximagecreate, onyx_get_output_list, VP8_FOURCC);
+}
+/*
+ * onyx_exit: unregisters the VP8 ximage that onyx_init() registered under
+ * VP8_FOURCC. Always returns DXV_OK.
+ */
+int onyx_exit(void)
+{
+
+    vpxdxv_un_register_ximage(VP8_FOURCC);  /* any result of the call is not looked at */
+
+    return DXV_OK;
+}
+/****************************************************************************
+ *
+ *  ROUTINE       :  onyx_set_parameter
+ *
+ *  INPUTS        :  XIMAGE_HANDLE src      : ximage whose settings change
+ *                   int Command            : one of the PBC_SET_* codes
+ *                   unsigned int Parameter : new value, or a pointer packed
+ *                                            into an unsigned int
+ *
+ *  OUTPUTS       :  None.
+ *
+ *  RETURNS       :  void
+ *
+ *  FUNCTION      :  Applies one setting to the VP8 ximage state.
+ *
+ *  SPECIAL NOTES :  Unknown commands and a missing algorithm base are ignored.
+ *                   NOTE(review): pointer-in-unsigned-int assumes 32-bit pointers.
+ ****************************************************************************/
+void onyx_set_parameter(XIMAGE_HANDLE src, int Command, unsigned int Parameter)
+{
+    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+
+    /* same guard as the other entry points: without a base there is nothing to update */
+    if (this_algorithm_base == 0)
+        return;
+
+    switch (Command)
+    {
+    case PBC_SET_CPUFREE:
+        this_algorithm_base->cpu_free  = Parameter;
+        break;
+    case PBC_SET_POSTPROC:
+        this_algorithm_base->postproc = Parameter;
+        break;
+
+    case PBC_SET_BLITBUFF:
+        this_algorithm_base->passed_in_buffer = (YV12_BUFFER_CONFIG *) Parameter;
+        break;
+
+    case PBC_SET_REFERENCEFRAME:
+        /* needs a decoder to copy into; my_pbi stays 0 until start_decode creates one */
+        if (this_algorithm_base->my_pbi)
+            vp8_yv12_copy_frame((YV12_BUFFER_CONFIG *) Parameter,
+                                &this_algorithm_base->my_pbi->common.last_frame);
+        break;
+
+    case PBC_SET_COMMON:
+        if (Parameter)      /* a zero Parameter leaves the current common in place */
+            this_algorithm_base->common = (VP8_COMMON *)Parameter;
+
+        break;
+    case PBC_SET_ADDNOISE:
+        this_algorithm_base->add_noise = Parameter;
+        break;
+    case PBC_SET_DEINTERLACEMODE:
+        this_algorithm_base->deinterlace = Parameter;
+        break;
+
+    default:
+        break;
+    }
+}
+/*
+ * Output format getter: stores the current bd_tag in *format_tag.
+ * Returns DXV_NULL_BASE when src has no algorithm base, DXV_OK otherwise.
+ */
+static int
+onyx_get_output_format(XIMAGE_HANDLE src, unsigned int *format_tag)
+{
+    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+
+    if (!tab)
+        return DXV_NULL_BASE;
+
+    /* the tag last stored by onyx_set_output_format */
+    *format_tag = tab->bd_tag;
+
+    return DXV_OK;
+}
+
+/*
+ * onyx_set_output_format
+ *
+ * Selects the output format bd_tag and the blitter that goes with it.
+ * Returns DXV_NULL_BASE when src has no algorithm base, DXV_INVALID_BLIT
+ * when bd_tag is not in g_vp8_preferred_output_format_list or has no
+ * blitter, and DXV_OK once blitter and bd_tag have been stored.
+ */
+static int
+onyx_set_output_format(XIMAGE_HANDLE src, unsigned int bd_tag)
+{
+    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
+    int i;
+
+    if (this_algorithm_base == 0)
+        return DXV_NULL_BASE;
+
+    /* the tag must be one of the preferred formats (the list ends with 0) */
+    for (i = 0; g_vp8_preferred_output_format_list[i] != 0; i++)
+    {
+        if (g_vp8_preferred_output_format_list[i] == bd_tag)
+            break;
+    }
+
+    if (g_vp8_preferred_output_format_list[i] == 0)
+        return DXV_INVALID_BLIT;
+
+    /* a listed format can still be without a blitter; this is the same
+     * "not available" value that onyx_get_output_list() tests for, and it
+     * must never be stored as a callable blitter
+     */
+    if (vpx_get_blitter(bd_tag) == (void *)0xffffffff)
+        return DXV_INVALID_BLIT;
+
+    this_algorithm_base->blitter = (vp8blit_func)vpx_get_blitter(bd_tag);
+    this_algorithm_base->bd_tag = bd_tag;
+
+    return DXV_OK;
+}
+
+/*
+ * vpx_get_size_of_pixel
+ *
+ * Maps an output format tag to the pixel size used by onyx_blit() when it
+ * turns an x offset into a byte offset:
+ *   VPXDXV_YV12, VPXDXV_I420                          -> 1
+ *   RGB555, RGB565, YUY2, UYVY, YVYU (and SPLIT565
+ *   when _ENABLE_SPLIT_PIXEL_ is defined)             -> 2
+ *   VPXDXV_RGB888 -> 3, VPXDXV_RGB8888 -> 4, any other tag -> -1
+ */
+int
+vpx_get_size_of_pixel(unsigned int bd)
+{
+    switch (bd)
+    {
+    case VPXDXV_YV12:
+    case VPXDXV_I420:
+        return 1;
+
+#ifdef _ENABLE_SPLIT_PIXEL_
+    case VPXDXV_SPLIT565:
+#endif
+    case VPXDXV_RGB555:
+    case VPXDXV_RGB565:
+    case VPXDXV_YUY2:
+    case VPXDXV_UYVY:
+    case VPXDXV_YVYU:
+        return 2;
+
+    case VPXDXV_RGB888:
+        return 3;
+
+    case VPXDXV_RGB8888:
+        return 4;
+
+    default:
+        break;
+    }
+
+    /* tag not handled above */
+    return -1;
+}
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
new file mode 100644
index 0000000..6d7cc36
--- /dev/null
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "onyxd_int.h"
+
+
+#if HAVE_MMX
+void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d)
+{
+    /* Forward the block's qcoeff, dqcoeff and dequant buffers, in that
+     * order, to the MMX implementation as plain short pointers. */
+    vp8_dequantize_b_impl_mmx((short *) d->qcoeff,
+                              (short *) d->dqcoeff, (short *) d->dequant);
+}
+#endif
+
+void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
+{
+    /* SIMD capability bits reported by x86_simd_caps(). */
+    int cpu_caps = x86_simd_caps();
+
+    /* Note:
+     *
+     * This platform can also be built without runtime CPU detection. If you
+     * change any of the function mappings in this file, make the same change
+     * in the static mappings (<arch>/filename_<arch>.h).
+     */
+#if CONFIG_RUNTIME_CPU_DETECT && HAVE_MMX
+
+    /* Replace the default dequantizers with the MMX versions when the
+     * capability bits say MMX is available. */
+    if ((cpu_caps & HAS_MMX) != 0)
+    {
+        pbi->dequant.idct_dc = vp8_dequant_dc_idct_mmx;
+        pbi->dequant.idct    = vp8_dequant_idct_mmx;
+        pbi->dequant.block   = vp8_dequantize_b_mmx;
+    }
+
+#endif
+}
diff --git a/vp8/decoder/xprintf.c b/vp8/decoder/xprintf.c
new file mode 100644
index 0000000..cb2221c
--- /dev/null
+++ b/vp8/decoder/xprintf.c
@@ -0,0 +1,163 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     xprintf.c
+*
+*   Description  :     Display a printf style message on the current video frame.
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+
+#include <stdio.h>
+#include <stdarg.h>
+#ifdef _WIN32_WCE
+#include <windows.h>
+#endif
+#include "xprintf.h"
+
+/****************************************************************************
+ *
+ *  ROUTINE       : onyx_xprintf
+ *
+ *  INPUTS        : unsigned char *ppbuffer : Buffer the text is drawn into.
+ *                  long n_pixel            : Offset into buffer to write text.
+ *                  long n_size             : Not referenced by this routine.
+ *                  long n_stride           : Offset between rows of the buffer.
+ *                  const char *format      : Format string for print.
+ *                  ...                     : Variable length argument list.
+ *
+ *  OUTPUTS       : ppbuffer: bytes under the rendered text are set to 255.
+ *
+ *  RETURNS       : int: Length of the formatted text; 0 if nothing was drawn.
+ *
+ *  FUNCTION      : Display a printf style message on the current video frame.
+ *
+ *  SPECIAL NOTES : Only draws when built with _WIN32_WCE; text is cut at 255 chars.
+ *
+ ****************************************************************************/
+int onyx_xprintf(unsigned char *ppbuffer, long n_pixel, long n_size, long n_stride, const char *format, ...)
+{
+    int rc = 0;
+
+#ifdef _WIN32_WCE
+    //  BOOL/HFONT need <windows.h>, which this file only includes for _WIN32_WCE
+    BOOL b_rc;
+    va_list arglist;
+    char sz_formatted[256] = "";
+    unsigned char *p_dest = &ppbuffer[n_pixel];
+
+    //  Set up temporary bitmap
+    HDC hdc_memory   = NULL;
+    HBITMAP hbm_temp = NULL;
+    HBITMAP hbm_orig = NULL;
+    RECT rect;
+
+    //  Start as NULL so the cleanup at Exit is safe after an early exit
+    HFONT hfont  = NULL;
+    HFONT hfonto = NULL;
+
+    //  Copy bitmap to video frame
+    long x, y;
+
+    //  Format text; the last byte of the zero-filled buffer is held back so
+    //  the string stays NUL terminated even when the output is truncated
+    va_start(arglist, format);
+    _vsnprintf(sz_formatted, sizeof(sz_formatted) - 1, format, arglist);
+    va_end(arglist);
+
+    rect.left   = 0;
+    rect.top    = 0;
+    rect.right  = 8 * strlen(sz_formatted);
+    rect.bottom = 8;
+
+    hdc_memory = create_compatible_dc(NULL);
+
+    if (hdc_memory == NULL)
+        goto Exit;
+
+    hbm_temp = create_bitmap(rect.right, rect.bottom, 1, 1, NULL);
+
+    if (hbm_temp == NULL)
+        goto Exit;
+
+    hbm_orig = (HBITMAP)(select_object(hdc_memory, hbm_temp));
+
+    if (!hbm_orig)
+        goto Exit;
+
+    //  Write text into bitmap
+    hfont = create_font(8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, VARIABLE_PITCH | FF_SWISS, "");
+
+    if (hfont == NULL)
+        goto Exit;
+
+    //  Select the new font, keeping the previous one so Exit can restore it
+    hfonto = (HFONT)(select_object(hdc_memory, hfont));
+
+    if (!hfonto)
+        goto Exit;
+
+    set_text_color(hdc_memory, 1);
+    set_bk_color(hdc_memory, 0);
+    set_bk_mode(hdc_memory, TRANSPARENT);
+
+    b_rc = bit_blt(hdc_memory, rect.left, rect.top, rect.right, rect.bottom, hdc_memory, rect.left, rect.top, BLACKNESS);
+
+    if (!b_rc)
+        goto Exit;
+
+    b_rc = ext_text_out(hdc_memory, 0, 0, ETO_CLIPPED, &rect, sz_formatted, strlen(sz_formatted), NULL);
+
+    if (!b_rc)
+        goto Exit;
+
+    for (y = rect.top; y < rect.bottom; ++y)
+    {
+        for (x = rect.left; x < rect.right; ++x)
+        {
+            if (get_pixel(hdc_memory, x, rect.bottom - 1 - y))
+                p_dest[x] = 255;
+        }
+
+        p_dest += n_stride;
+    }
+
+    rc = strlen(sz_formatted);
+
+Exit:
+
+    if (hbm_temp != NULL)
+    {
+        if (hbm_orig != NULL)
+            select_object(hdc_memory, hbm_orig);
+
+        delete_object(hbm_temp);
+    }
+
+    if (hfont != NULL)
+    {
+        if (hfonto != NULL)
+            select_object(hdc_memory, hfonto);
+
+        delete_object(hfont);
+    }
+
+    if (hdc_memory != NULL)
+        delete_dc(hdc_memory);
+
+#endif
+
+    return rc;
+}
diff --git a/vp8/decoder/xprintf.h b/vp8/decoder/xprintf.h
new file mode 100644
index 0000000..2f175e9
--- /dev/null
+++ b/vp8/decoder/xprintf.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     xprintf.h
+*
+*   Description  :     Debug print interface header file.
+*
+****************************************************************************/
+#ifndef __INC_XPRINTF_H
+#define __INC_XPRINTF_H
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+
+/****************************************************************************
+*  Functions
+****************************************************************************/
+
+// Display a printf style message on the current video frame
+extern int onyx_xprintf(unsigned char *ppbuffer, long n_pixel, long n_size, long n_stride, const char *format, ...);
+
+#endif