Merge "Recalculate zbin_extra only if regular quantizer is being used"
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index 376707e..d732317 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -186,7 +186,7 @@
void vp8_create_common(VP8_COMMON *oci)
{
vp8_machine_specific_config(oci);
- vp8_default_coef_probs(oci);
+
vp8_init_mbmode_probs(oci);
vp8_default_bmode_probs(oci->fc.bmode_prob);
diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h
new file mode 100755
index 0000000..0d19563
--- /dev/null
+++ b/vp8/common/default_coef_probs.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/*Generated file, included by entropy.c*/
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] =
+{
+ { /* Block Type ( 0 ) */
+ { /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 1 ) */
+ { /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 2 ) */
+ { /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 3 ) */
+ { /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ }
+};
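
[Illustration, not part of the patch] Each entry in the new table is a vp8_prob: the probability, out of 256, of taking the zero branch at one of the ENTROPY_NODES internal nodes of the coefficient token tree, indexed by block type, coefficient band and previous-token context. A value of 128 is the neutral "coin flip" setting, used for contexts with no useful default statistics. A minimal, hypothetical lookup helper (not in the tree) would be:

    #include "entropy.h"   /* vp8_prob, BLOCK_TYPES, COEF_BANDS, PREV_COEF_CONTEXTS, ENTROPY_NODES */

    /* Hypothetical helper: fetch one default node probability from the table above. */
    static vp8_prob default_node_prob(int blk_type, int band, int prev_ctx, int node)
    {
        return default_coef_probs[blk_type][band][prev_ctx][node];
    }
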
diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h
deleted file mode 100644
index 7a1e28b..0000000
--- a/vp8/common/defaultcoefcounts.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __DEFAULTCOEFCOUNTS_H
-#define __DEFAULTCOEFCOUNTS_H
-
-#include "entropy.h"
-
-extern const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS];
-
-#endif //__DEFAULTCOEFCOUNTS_H
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index 0eee60e..f3d5a9c 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -15,6 +15,7 @@
#include "string.h"
#include "blockd.h"
#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
#define uchar unsigned char /* typedefs can clash */
#define uint unsigned int
@@ -153,39 +154,15 @@
{ cat6, Pcat6, 11, 67},
{ 0, 0, 0, 0}
};
-#include "defaultcoefcounts.h"
+
+#include "default_coef_probs.h"
void vp8_default_coef_probs(VP8_COMMON *pc)
{
- int h = 0;
-
- do
- {
- int i = 0;
-
- do
- {
- int k = 0;
-
- do
- {
- unsigned int branch_ct [ENTROPY_NODES] [2];
- vp8_tree_probs_from_distribution(
- MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
- pc->fc.coef_probs[h][i][k],
- branch_ct,
- vp8_default_coef_counts[h][i][k],
- 256, 1);
-
- }
- while (++k < PREV_COEF_CONTEXTS);
- }
- while (++i < COEF_BANDS);
- }
- while (++h < BLOCK_TYPES);
+ vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+ sizeof(default_coef_probs));
}
-
void vp8_coef_tree_initialize()
{
init_bit_trees();
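
[Sketch, not part of the patch] The deleted loop derived pc->fc.coef_probs at runtime from vp8_default_coef_counts via vp8_tree_probs_from_distribution; the new code just copies the precomputed default_coef_probs table. Since the new header is marked as generated, a table like it can be regenerated offline with essentially the old loop. A rough generator sketch follows, assuming the pre-patch vp8_default_coef_counts and the existing vp8_tree_probs_from_distribution are available to the tool; the hand-formatted braces and comments of the checked-in header are omitted.

    #include <stdio.h>
    #include "entropy.h"            /* BLOCK_TYPES, COEF_BANDS, PREV_COEF_CONTEXTS, ENTROPY_NODES, ... */
    #include "defaultcoefcounts.h"  /* vp8_default_coef_counts (the table this change removes) */

    static void dump_default_coef_probs(void)
    {
        int h, i, k, n;

        printf("static const vp8_prob default_coef_probs [...] =\n{\n");

        for (h = 0; h < BLOCK_TYPES; h++)
            for (i = 0; i < COEF_BANDS; i++)
                for (k = 0; k < PREV_COEF_CONTEXTS; k++)
                {
                    vp8_prob probs[ENTROPY_NODES];
                    unsigned int branch_ct[ENTROPY_NODES][2];

                    /* Same call the deleted runtime loop made, but writing into
                       a local array instead of pc->fc.coef_probs. */
                    vp8_tree_probs_from_distribution(
                        MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
                        probs, branch_ct,
                        vp8_default_coef_counts[h][i][k], 256, 1);

                    printf("    {");
                    for (n = 0; n < ENTROPY_NODES; n++)
                        printf(" %3d,", probs[n]);
                    printf(" },\n");
                }

        printf("};\n");
    }
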
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index 34a7e18..83d3765 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -11,7 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
-;void idct_dequant_0_2x_sse2
+;void vp8_idct_dequant_0_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
@@ -21,8 +21,8 @@
; int blk_stride - 5
; )
-global sym(idct_dequant_0_2x_sse2)
-sym(idct_dequant_0_2x_sse2):
+global sym(vp8_idct_dequant_0_2x_sse2)
+sym(vp8_idct_dequant_0_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@@ -97,8 +97,8 @@
pop rbp
ret
-global sym(idct_dequant_full_2x_sse2)
-sym(idct_dequant_full_2x_sse2):
+global sym(vp8_idct_dequant_full_2x_sse2)
+sym(vp8_idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -353,7 +353,7 @@
pop rbp
ret
-;void idct_dequant_dc_0_2x_sse2
+;void vp8_idct_dequant_dc_0_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
@@ -362,8 +362,8 @@
; int dst_stride - 4
; short *dc - 5
; )
-global sym(idct_dequant_dc_0_2x_sse2)
-sym(idct_dequant_dc_0_2x_sse2):
+global sym(vp8_idct_dequant_dc_0_2x_sse2)
+sym(vp8_idct_dequant_dc_0_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -438,8 +438,8 @@
pop rbp
ret
-global sym(idct_dequant_dc_full_2x_sse2)
-sym(idct_dequant_dc_full_2x_sse2):
+global sym(vp8_idct_dequant_dc_full_2x_sse2)
+sym(vp8_idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index ad47284..697a5de 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -40,7 +40,7 @@
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
-next8_h:
+.next8_h:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
@@ -211,7 +211,7 @@
add rsi,8
neg rax
dec rcx
- jnz next8_h
+ jnz .next8_h
add rsp, 32
pop rsp
@@ -255,7 +255,7 @@
lea rsi, [rsi + rax*4 - 4]
movsxd rcx, dword ptr arg(5) ;count
-next8_v:
+.next8_v:
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
@@ -581,7 +581,7 @@
lea rsi, [rsi+rax*8]
dec rcx
- jnz next8_v
+ jnz .next8_v
add rsp, 64
pop rsp
@@ -622,7 +622,7 @@
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
-next8_mbh:
+.next8_mbh:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
@@ -898,7 +898,7 @@
neg rax
add rsi,8
dec rcx
- jnz next8_mbh
+ jnz .next8_mbh
add rsp, 32
pop rsp
@@ -942,7 +942,7 @@
lea rsi, [rsi + rax*4 - 4]
movsxd rcx, dword ptr arg(5) ;count
-next8_mbv:
+.next8_mbv:
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
;transpose
@@ -1365,7 +1365,7 @@
lea rsi, [rsi+rax*8]
dec rcx
- jnz next8_mbv
+ jnz .next8_mbv
add rsp, 96
pop rsp
@@ -1398,7 +1398,7 @@
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
mov rcx, 2 ; count
-nexts8_h:
+.nexts8_h:
mov rdx, arg(2) ;blimit ; get blimit
movq mm3, [rdx] ;
@@ -1483,7 +1483,7 @@
add rsi,8
neg rax
dec rcx
- jnz nexts8_h
+ jnz .nexts8_h
; begin epilog
pop rdi
@@ -1520,7 +1520,7 @@
lea rsi, [rsi + rax*4- 2]; ;
mov rcx, 2 ; count
-nexts8_v:
+.nexts8_v:
lea rdi, [rsi + rax];
movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
@@ -1695,7 +1695,7 @@
lea rsi, [rsi+rax*8] ; next 8
dec rcx
- jnz nexts8_v
+ jnz .nexts8_v
add rsp, 32
pop rsp
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 4efff7e..295609c 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -1395,8 +1395,8 @@
neg rax
; calculate mask
- movdqu xmm1, [rsi+2*rax] ; p1
- movdqu xmm0, [rdi] ; q1
+ movdqa xmm1, [rsi+2*rax] ; p1
+ movdqa xmm0, [rdi] ; q1
movdqa xmm2, xmm1
movdqa xmm7, xmm0
movdqa xmm4, xmm0
@@ -1406,8 +1406,8 @@
pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
- movdqu xmm5, [rsi+rax] ; p0
- movdqu xmm4, [rsi] ; q0
+ movdqa xmm5, [rsi+rax] ; p0
+ movdqa xmm4, [rsi] ; q0
movdqa xmm0, xmm4 ; q0
movdqa xmm6, xmm5 ; p0
psubusb xmm5, xmm4 ; p0-=q0
@@ -1449,7 +1449,7 @@
psubsb xmm3, xmm0 ; q0-= q0 add
pxor xmm3, [GLOBAL(t80)] ; unoffset
- movdqu [rsi], xmm3 ; write back
+ movdqa [rsi], xmm3 ; write back
; now do +3 side
psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
@@ -1465,7 +1465,7 @@
paddsb xmm6, xmm0 ; p0+= p0 add
pxor xmm6, [GLOBAL(t80)] ; unoffset
- movdqu [rsi+rax], xmm6 ; write back
+ movdqa [rsi+rax], xmm6 ; write back
; begin epilog
pop rdi
@@ -1507,17 +1507,17 @@
lea rdx, [rsi + rax*4]
lea rcx, [rdx + rax]
- movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
- movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
- movdqu xmm2, [rdi] ; 13 12 11 10
- movdqu xmm3, [rcx] ; 53 52 51 50
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
- movdqu xmm4, [rsi + rax*2] ; 23 22 21 20
- movdqu xmm5, [rdx + rax*2] ; 63 62 61 60
- movdqu xmm6, [rdi + rax*2] ; 33 32 31 30
- movdqu xmm7, [rcx + rax*2] ; 73 72 71 70
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
@@ -1540,17 +1540,17 @@
lea rdx, [rsi + rax*4]
lea rcx, [rdx + rax]
- movdqu xmm4, [rsi] ; 83 82 81 80
- movdqu xmm1, [rdx] ; c3 c2 c1 c0
- movdqu xmm6, [rdi] ; 93 92 91 90
- movdqu xmm3, [rcx] ; d3 d2 d1 d0
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
- movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0
- movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0
- movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
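
[Illustration, not part of the patch] The first two hunks above replace 16-byte unaligned loads/stores (movdqu) with aligned ones (movdqa), which is cheaper but faults if the address is not 16-byte aligned, so this presumably relies on the filtered rows being known to be aligned on this path. The transpose-setup hunks replace movdqu with movd because, as the register comments note, only the low 32 bits of each register are used before the punpckldq merges. In C with SSE2 intrinsics (hypothetical function, not project code) the three load forms look roughly like:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch of the three load widths the hunks switch between. */
    static __m128i load_forms(const uint8_t *p16_aligned, const uint8_t *p_any)
    {
        /* movdqa: full 16-byte load; the address must be 16-byte aligned. */
        __m128i a = _mm_load_si128((const __m128i *)p16_aligned);

        /* movdqu: full 16-byte load from any address (slower on older CPUs). */
        __m128i u = _mm_loadu_si128((const __m128i *)p_any);

        /* movd: load only 4 bytes into the low dword, upper bits zeroed --
           enough when just 4 pixels per row feed the following punpckldq. */
        int32_t dw;
        memcpy(&dw, p_any, sizeof dw);
        __m128i d = _mm_cvtsi32_si128(dw);

        return _mm_xor_si128(_mm_xor_si128(a, u), d);
    }
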
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index 787e832..8112218 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -58,10 +58,10 @@
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
-nextrow:
+.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
@@ -146,7 +146,7 @@
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
- jl nextcol
+ jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
@@ -156,7 +156,7 @@
xor rdx, rdx
mov rax, [rdi-4];
-acrossnextcol:
+.acrossnextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ;
movq mm4, [rdi+rdx] ; mm4 = p0..p7
@@ -237,7 +237,7 @@
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
- jl acrossnextcol;
+ jl .acrossnextcol;
mov DWORD PTR [rdi+rdx-4], eax
pop rax
@@ -249,7 +249,7 @@
movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
- jnz nextrow ; next row
+ jnz .nextrow ; next row
pop rbx
; begin epilog
@@ -293,7 +293,7 @@
add dword ptr arg(2), 8
;for(c=0; c<cols; c+=4)
-loop_col:
+.loop_col:
mov rsi, arg(0) ;s
pxor mm0, mm0 ;
@@ -312,7 +312,7 @@
mov rcx, 15 ;
-loop_initvar:
+.loop_initvar:
movd mm1, DWORD PTR [rdi];
punpcklbw mm1, mm0 ;
@@ -329,10 +329,10 @@
lea rdi, [rdi+rax] ;
dec rcx
- jne loop_initvar
+ jne .loop_initvar
;save the var and sum
xor rdx, rdx
-loop_row:
+.loop_row:
movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
@@ -438,13 +438,13 @@
add rdx, 1
cmp edx, dword arg(2) ;rows
- jl loop_row
+ jl .loop_row
add dword arg(0), 4 ; s += 4
sub dword arg(3), 4 ; cols -= 4
cmp dword arg(3), 0
- jg loop_col
+ jg .loop_col
add rsp, 136
pop rsp
@@ -475,7 +475,7 @@
push rdi
; end prolog
-addnoise_loop:
+.addnoise_loop:
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
@@ -492,7 +492,7 @@
mov rsi, arg(0) ;Pos
xor rax,rax
-addnoise_nextset:
+.addnoise_nextset:
movq mm1,[rsi+rax] ; get the source
psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
@@ -506,12 +506,12 @@
add rax,8 ; move to the next line
cmp rax, rcx
- jl addnoise_nextset
+ jl .addnoise_nextset
movsxd rax, dword arg(7) ; Pitch
add arg(0), rax ; Start += Pitch
sub dword arg(6), 1 ; Height -= 1
- jg addnoise_loop
+ jg .addnoise_loop
; begin epilog
pop rdi
diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c
deleted file mode 100644
index 6b6321a..0000000
--- a/vp8/common/x86/postproc_mmx.c
+++ /dev/null
@@ -1,1508 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include <stdlib.h>
-#include "vpx_scale/yv12config.h"
-#include "pragmas.h"
-
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT 7
-
-
-
-/* static constants */
-__declspec(align(16))
-const static short Blur[48] =
-{
-
- 16, 16, 16, 16, 16, 16, 16, 16,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 64, 64, 64, 64, 64, 64, 64, 64,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 16, 16, 16, 16, 16, 16, 16, 16,
- 0, 0, 0, 0, 0, 0, 0, 0,
-
-};
-#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
-#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
-
-#ifndef RELOCATEABLE
-const static RD;
-const static R4D2;
-#endif
-
-
-/* external references */
-extern double vp8_gaussian(double sigma, double mu, double x);
-extern short vp8_rv[];
-extern int vp8_q2mbl(int x) ;
-
-
-
-void vp8_post_proc_down_and_across_mmx
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit
-)
-{
-#ifdef RELOCATEABLE
- RD
- R4D2
-#endif
-
- __asm
- {
- push ebx
- lea ebx, Blur
- movd mm2, flimit
- punpcklwd mm2, mm2
- punpckldq mm2, mm2
-
- mov esi, src_ptr
- mov edi, dst_ptr
-
- mov ecx, DWORD PTR rows
- mov eax, src_pixels_per_line ;
- destination pitch?
- pxor mm0, mm0 ;
- mm0 = 00000000
-
- nextrow:
-
- xor edx, edx ;
-
- clear out edx for use as loop counter
- nextcol:
-
- pxor mm7, mm7 ;
-
- mm7 = 00000000
- movq mm6, [ebx + 32 ] ;
- mm6 = kernel 2 taps
- movq mm3, [esi] ;
- mm4 = r0 p0..p7
- punpcklbw mm3, mm0 ;
- mm3 = p0..p3
- movq mm1, mm3 ;
- mm1 = p0..p3
- pmullw mm3, mm6 ;
- mm3 *= kernel 2 modifiers
-
- movq mm6, [ebx + 48] ;
- mm6 = kernel 3 taps
- movq mm5, [esi + eax] ;
- mm4 = r1 p0..p7
- punpcklbw mm5, mm0 ;
- mm5 = r1 p0..p3
- pmullw mm6, mm5 ;
- mm6 *= p0..p3 * kernel 3 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm6
-
- ;
- thresholding
- movq mm7, mm1 ;
- mm7 = r0 p0..p3
- psubusw mm7, mm5 ;
- mm7 = r0 p0..p3 - r1 p0..p3
- psubusw mm5, mm1 ;
- mm5 = r1 p0..p3 - r0 p0..p3
- paddusw mm7, mm5 ;
- mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw mm7, mm2
-
- movq mm6, [ebx + 64 ] ;
- mm6 = kernel 4 modifiers
- movq mm5, [esi + 2*eax] ;
- mm4 = r2 p0..p7
- punpcklbw mm5, mm0 ;
- mm5 = r2 p0..p3
- pmullw mm6, mm5 ;
- mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = r0 p0..p3
- psubusw mm6, mm5 ;
- mm6 = r0 p0..p3 - r2 p0..p3
- psubusw mm5, mm1 ;
- mm5 = r2 p0..p3 - r2 p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
-
- neg eax
- movq mm6, [ebx ] ;
- kernel 0 taps
- movq mm5, [esi+2*eax] ;
- mm4 = r-2 p0..p7
- punpcklbw mm5, mm0 ;
- mm5 = r-2 p0..p3
- pmullw mm6, mm5 ;
- mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = r0 p0..p3
- psubusw mm6, mm5 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw mm5, mm1 ;
- mm5 = r-2 p0..p3 - p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
- movq mm6, [ebx + 16] ;
- kernel 1 taps
- movq mm4, [esi+eax] ;
- mm4 = r-1 p0..p7
- punpcklbw mm4, mm0 ;
- mm4 = r-1 p0..p3
- pmullw mm6, mm4 ;
- mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = r0 p0..p3
- psubusw mm6, mm4 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw mm4, mm1 ;
- mm5 = r-1 p0..p3 - p0..p3
- paddusw mm6, mm4 ;
- mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
-
- paddusw mm3, rd ;
- mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ;
- mm3 /= 128
-
- pand mm1, mm7 ;
- mm1 select vals > thresh from source
- pandn mm7, mm3 ;
- mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ;
- combination
-
- packuswb mm1, mm0 ;
- pack to bytes
-
- movd [edi], mm1 ;
- neg eax ;
- pitch is positive
-
-
- add esi, 4
- add edi, 4
- add edx, 4
-
- cmp edx, cols
- jl nextcol
- // done with the all cols, start the across filtering in place
- sub esi, edx
- sub edi, edx
-
-
- push eax
- xor edx, edx
- mov eax, [edi-4];
-
- acrossnextcol:
- pxor mm7, mm7 ;
- mm7 = 00000000
- movq mm6, [ebx + 32 ] ;
- movq mm4, [edi+edx] ;
- mm4 = p0..p7
- movq mm3, mm4 ;
- mm3 = p0..p7
- punpcklbw mm3, mm0 ;
- mm3 = p0..p3
- movq mm1, mm3 ;
- mm1 = p0..p3
- pmullw mm3, mm6 ;
- mm3 *= kernel 2 modifiers
-
- movq mm6, [ebx + 48]
- psrlq mm4, 8 ;
- mm4 = p1..p7
- movq mm5, mm4 ;
- mm5 = p1..p7
- punpcklbw mm5, mm0 ;
- mm5 = p1..p4
- pmullw mm6, mm5 ;
- mm6 *= p1..p4 * kernel 3 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm6
-
- ;
- thresholding
- movq mm7, mm1 ;
- mm7 = p0..p3
- psubusw mm7, mm5 ;
- mm7 = p0..p3 - p1..p4
- psubusw mm5, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm7, mm5 ;
- mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw mm7, mm2
-
- movq mm6, [ebx + 64 ]
- psrlq mm4, 8 ;
- mm4 = p2..p7
- movq mm5, mm4 ;
- mm5 = p2..p7
- punpcklbw mm5, mm0 ;
- mm5 = p2..p5
- pmullw mm6, mm5 ;
- mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = p0..p3
- psubusw mm6, mm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
-
- movq mm6, [ebx ]
- movq mm4, [edi+edx-2] ;
- mm4 = p-2..p5
- movq mm5, mm4 ;
- mm5 = p-2..p5
- punpcklbw mm5, mm0 ;
- mm5 = p-2..p1
- pmullw mm6, mm5 ;
- mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = p0..p3
- psubusw mm6, mm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw mm5, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm6, mm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
- movq mm6, [ebx + 16]
- psrlq mm4, 8 ;
- mm4 = p-1..p5
- punpcklbw mm4, mm0 ;
- mm4 = p-1..p2
- pmullw mm6, mm4 ;
- mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ;
- mm3 += mm5
-
- ;
- thresholding
- movq mm6, mm1 ;
- mm6 = p0..p3
- psubusw mm6, mm4 ;
- mm6 = p0..p3 - p1..p4
- psubusw mm4, mm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw mm6, mm4 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw mm6, mm2
- por mm7, mm6 ;
- accumulate thresholds
-
- paddusw mm3, rd ;
- mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ;
- mm3 /= 128
-
- pand mm1, mm7 ;
- mm1 select vals > thresh from source
- pandn mm7, mm3 ;
- mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ;
- combination
-
- packuswb mm1, mm0 ;
- pack to bytes
- mov DWORD PTR [edi+edx-4], eax ;
- store previous four bytes
- movd eax, mm1
-
- add edx, 4
- cmp edx, cols
- jl acrossnextcol;
-
- mov DWORD PTR [edi+edx-4], eax
- pop eax
-
- // done with this rwo
- add esi, eax ;
- next line
- mov eax, dst_pixels_per_line ;
- destination pitch?
- add edi, eax ;
- next destination
- mov eax, src_pixels_per_line ;
- destination pitch?
-
- dec ecx ;
- decrement count
- jnz nextrow ;
- next row
- pop ebx
-
- }
-}
-
-
-
-void vp8_post_proc_down_and_across_xmm
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int rows,
- int cols,
- int flimit
-)
-{
-#ifdef RELOCATEABLE
- R4D2
-#endif
-
- __asm
- {
- movd xmm2, flimit
- punpcklwd xmm2, xmm2
- punpckldq xmm2, xmm2
- punpcklqdq xmm2, xmm2
-
- mov esi, src_ptr
- mov edi, dst_ptr
-
- mov ecx, DWORD PTR rows
- mov eax, src_pixels_per_line ;
- destination pitch?
- pxor xmm0, xmm0 ;
- mm0 = 00000000
-
- nextrow:
-
- xor edx, edx ;
-
- clear out edx for use as loop counter
- nextcol:
- movq xmm3, QWORD PTR [esi] ;
-
- mm4 = r0 p0..p7
- punpcklbw xmm3, xmm0 ;
- mm3 = p0..p3
- movdqa xmm1, xmm3 ;
- mm1 = p0..p3
- psllw xmm3, 2 ;
-
- movq xmm5, QWORD PTR [esi + eax] ;
- mm4 = r1 p0..p7
- punpcklbw xmm5, xmm0 ;
- mm5 = r1 p0..p3
- paddusw xmm3, xmm5 ;
- mm3 += mm6
-
- ;
- thresholding
- movdqa xmm7, xmm1 ;
- mm7 = r0 p0..p3
- psubusw xmm7, xmm5 ;
- mm7 = r0 p0..p3 - r1 p0..p3
- psubusw xmm5, xmm1 ;
- mm5 = r1 p0..p3 - r0 p0..p3
- paddusw xmm7, xmm5 ;
- mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw xmm7, xmm2
-
- movq xmm5, QWORD PTR [esi + 2*eax] ;
- mm4 = r2 p0..p7
- punpcklbw xmm5, xmm0 ;
- mm5 = r2 p0..p3
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = r0 p0..p3 - r2 p0..p3
- psubusw xmm5, xmm1 ;
- mm5 = r2 p0..p3 - r2 p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
-
- neg eax
- movq xmm5, QWORD PTR [esi+2*eax] ;
- mm4 = r-2 p0..p7
- punpcklbw xmm5, xmm0 ;
- mm5 = r-2 p0..p3
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = r0 p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm5, xmm1 ;
- mm5 = r-2 p0..p3 - p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
- movq xmm4, QWORD PTR [esi+eax] ;
- mm4 = r-1 p0..p7
- punpcklbw xmm4, xmm0 ;
- mm4 = r-1 p0..p3
- paddusw xmm3, xmm4 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = r0 p0..p3
- psubusw xmm6, xmm4 ;
- mm6 = p0..p3 - r-2 p0..p3
- psubusw xmm4, xmm1 ;
- mm5 = r-1 p0..p3 - p0..p3
- paddusw xmm6, xmm4 ;
- mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
-
- paddusw xmm3, rd42 ;
- mm3 += round value
- psraw xmm3, 3 ;
- mm3 /= 8
-
- pand xmm1, xmm7 ;
- mm1 select vals > thresh from source
- pandn xmm7, xmm3 ;
- mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ;
- combination
-
- packuswb xmm1, xmm0 ;
- pack to bytes
- movq QWORD PTR [edi], xmm1 ;
-
- neg eax ;
- pitch is positive
- add esi, 8
- add edi, 8
-
- add edx, 8
- cmp edx, cols
-
- jl nextcol
-
- // done with the all cols, start the across filtering in place
- sub esi, edx
- sub edi, edx
-
- xor edx, edx
- movq mm0, QWORD PTR [edi-8];
-
- acrossnextcol:
- movq xmm7, QWORD PTR [edi +edx -2]
- movd xmm4, DWORD PTR [edi +edx +6]
-
- pslldq xmm4, 8
- por xmm4, xmm7
-
- movdqa xmm3, xmm4
- psrldq xmm3, 2
- punpcklbw xmm3, xmm0 ;
- mm3 = p0..p3
- movdqa xmm1, xmm3 ;
- mm1 = p0..p3
- psllw xmm3, 2
-
-
- movdqa xmm5, xmm4
- psrldq xmm5, 3
- punpcklbw xmm5, xmm0 ;
- mm5 = p1..p4
- paddusw xmm3, xmm5 ;
- mm3 += mm6
-
- ;
- thresholding
- movdqa xmm7, xmm1 ;
- mm7 = p0..p3
- psubusw xmm7, xmm5 ;
- mm7 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm7, xmm5 ;
- mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm7, xmm2
-
- movdqa xmm5, xmm4
- psrldq xmm5, 4
- punpcklbw xmm5, xmm0 ;
- mm5 = p2..p5
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
-
- movdqa xmm5, xmm4 ;
- mm5 = p-2..p5
- punpcklbw xmm5, xmm0 ;
- mm5 = p-2..p1
- paddusw xmm3, xmm5 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = p0..p3
- psubusw xmm6, xmm5 ;
- mm6 = p0..p3 - p1..p4
- psubusw xmm5, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm5 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
- psrldq xmm4, 1 ;
- mm4 = p-1..p5
- punpcklbw xmm4, xmm0 ;
- mm4 = p-1..p2
- paddusw xmm3, xmm4 ;
- mm3 += mm5
-
- ;
- thresholding
- movdqa xmm6, xmm1 ;
- mm6 = p0..p3
- psubusw xmm6, xmm4 ;
- mm6 = p0..p3 - p1..p4
- psubusw xmm4, xmm1 ;
- mm5 = p1..p4 - p0..p3
- paddusw xmm6, xmm4 ;
- mm6 = abs(p0..p3 - p1..p4)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ;
- accumulate thresholds
-
- paddusw xmm3, rd42 ;
- mm3 += round value
- psraw xmm3, 3 ;
- mm3 /= 8
-
- pand xmm1, xmm7 ;
- mm1 select vals > thresh from source
- pandn xmm7, xmm3 ;
- mm7 select vals < thresh from blurred result
- paddusw xmm1, xmm7 ;
- combination
-
- packuswb xmm1, xmm0 ;
- pack to bytes
- movq QWORD PTR [edi+edx-8], mm0 ;
- store previous four bytes
- movdq2q mm0, xmm1
-
- add edx, 8
- cmp edx, cols
- jl acrossnextcol;
-
- // last 8 pixels
- movq QWORD PTR [edi+edx-8], mm0
-
- // done with this rwo
- add esi, eax ;
- next line
- mov eax, dst_pixels_per_line ;
- destination pitch?
- add edi, eax ;
- next destination
- mov eax, src_pixels_per_line ;
- destination pitch?
-
- dec ecx ;
- decrement count
- jnz nextrow ;
- next row
- }
-}
-
-
-void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
- int c, i;
- __declspec(align(16))
- int flimit2[2];
- __declspec(align(16))
- unsigned char d[16][8];
-
- flimit = vp8_q2mbl(flimit);
-
- for (i = 0; i < 2; i++)
- flimit2[i] = flimit;
-
- rows += 8;
-
- for (c = 0; c < cols; c += 4)
- {
- unsigned char *s = &dst[c];
-
- __asm
- {
- mov esi, s ;
- pxor mm0, mm0 ;
-
- mov eax, pitch ;
- neg eax // eax = -pitch
-
- lea esi, [esi + eax*8]; // edi = s[-pitch*8]
- neg eax
-
-
- pxor mm5, mm5
- pxor mm6, mm6 ;
-
- pxor mm7, mm7 ;
- mov edi, esi
-
- mov ecx, 15 ;
-
- loop_initvar:
- movd mm1, DWORD PTR [edi];
- punpcklbw mm1, mm0 ;
-
- paddw mm5, mm1 ;
- pmullw mm1, mm1 ;
-
- movq mm2, mm1 ;
- punpcklwd mm1, mm0 ;
-
- punpckhwd mm2, mm0 ;
- paddd mm6, mm1 ;
-
- paddd mm7, mm2 ;
- lea edi, [edi+eax] ;
-
- dec ecx
- jne loop_initvar
- //save the var and sum
- xor edx, edx
- loop_row:
- movd mm1, DWORD PTR [esi] // [s-pitch*8]
- movd mm2, DWORD PTR [edi] // [s+pitch*7]
-
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
-
- paddw mm5, mm2
- psubw mm5, mm1
-
- pmullw mm2, mm2
- movq mm4, mm2
-
- punpcklwd mm2, mm0
- punpckhwd mm4, mm0
-
- paddd mm6, mm2
- paddd mm7, mm4
-
- pmullw mm1, mm1
- movq mm2, mm1
-
- punpcklwd mm1, mm0
- psubd mm6, mm1
-
- punpckhwd mm2, mm0
- psubd mm7, mm2
-
-
- movq mm3, mm6
- pslld mm3, 4
-
- psubd mm3, mm6
- movq mm1, mm5
-
- movq mm4, mm5
- pmullw mm1, mm1
-
- pmulhw mm4, mm4
- movq mm2, mm1
-
- punpcklwd mm1, mm4
- punpckhwd mm2, mm4
-
- movq mm4, mm7
- pslld mm4, 4
-
- psubd mm4, mm7
-
- psubd mm3, mm1
- psubd mm4, mm2
-
- psubd mm3, flimit2
- psubd mm4, flimit2
-
- psrad mm3, 31
- psrad mm4, 31
-
- packssdw mm3, mm4
- packsswb mm3, mm0
-
- movd mm1, DWORD PTR [esi+eax*8]
-
- movq mm2, mm1
- punpcklbw mm1, mm0
-
- paddw mm1, mm5
- mov ecx, edx
-
- and ecx, 127
- movq mm4, vp8_rv[ecx*2]
-
- paddw mm1, mm4
- //paddw xmm1, eight8s
- psraw mm1, 4
-
- packuswb mm1, mm0
- pand mm1, mm3
-
- pandn mm3, mm2
- por mm1, mm3
-
- and ecx, 15
- movd DWORD PTR d[ecx*4], mm1
-
- mov ecx, edx
- sub ecx, 8
-
- and ecx, 15
- movd mm1, DWORD PTR d[ecx*4]
-
- movd [esi], mm1
- lea esi, [esi+eax]
-
- lea edi, [edi+eax]
- add edx, 1
-
- cmp edx, rows
- jl loop_row
-
- }
-
- }
-}
-
-void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
- int c, i;
- __declspec(align(16))
- int flimit4[4];
- __declspec(align(16))
- unsigned char d[16][8];
-
- flimit = vp8_q2mbl(flimit);
-
- for (i = 0; i < 4; i++)
- flimit4[i] = flimit;
-
- rows += 8;
-
- for (c = 0; c < cols; c += 8)
- {
- unsigned char *s = &dst[c];
-
- __asm
- {
- mov esi, s ;
- pxor xmm0, xmm0 ;
-
- mov eax, pitch ;
- neg eax // eax = -pitch
-
- lea esi, [esi + eax*8]; // edi = s[-pitch*8]
- neg eax
-
-
- pxor xmm5, xmm5
- pxor xmm6, xmm6 ;
-
- pxor xmm7, xmm7 ;
- mov edi, esi
-
- mov ecx, 15 ;
-
- loop_initvar:
- movq xmm1, QWORD PTR [edi];
- punpcklbw xmm1, xmm0 ;
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ;
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ;
-
- punpckhwd xmm2, xmm0 ;
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea edi, [edi+eax] ;
-
- dec ecx
- jne loop_initvar
- //save the var and sum
- xor edx, edx
- loop_row:
- movq xmm1, QWORD PTR [esi] // [s-pitch*8]
- movq xmm2, QWORD PTR [edi] // [s+pitch*7]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2
- psubw xmm5, xmm1
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1
-
- pmulhw xmm4, xmm4
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4
- punpckhwd xmm2, xmm4
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7
-
- psubd xmm3, xmm1
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4
- psubd xmm4, flimit4
-
- psrad xmm3, 31
- psrad xmm4, 31
-
- packssdw xmm3, xmm4
- packsswb xmm3, xmm0
-
- movq xmm1, QWORD PTR [esi+eax*8]
-
- movq xmm2, xmm1
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5
- mov ecx, edx
-
- and ecx, 127
- movdqu xmm4, vp8_rv[ecx*2]
-
- paddw xmm1, xmm4
- //paddw xmm1, eight8s
- psraw xmm1, 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3
-
- pandn xmm3, xmm2
- por xmm1, xmm3
-
- and ecx, 15
- movq QWORD PTR d[ecx*8], xmm1
-
- mov ecx, edx
- sub ecx, 8
-
- and ecx, 15
- movq mm0, d[ecx*8]
-
- movq [esi], mm0
- lea esi, [esi+eax]
-
- lea edi, [edi+eax]
- add edx, 1
-
- cmp edx, rows
- jl loop_row
-
- }
-
- }
-}
-#if 0
-/****************************************************************************
- *
- * ROUTINE : plane_add_noise_wmt
- *
- * INPUTS : unsigned char *Start starting address of buffer to add gaussian
- * noise to
- * unsigned int Width width of plane
- * unsigned int Height height of plane
- * int Pitch distance between subsequent lines of frame
- * int q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
-
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
- char char_dist[300];
- char Rand[2048];
- double sigma;
-// return;
- __asm emms
- sigma = a + .5 + .6 * (63 - q) / 63.0;
-
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
- {
- double i;
- int next, j;
-
- next = 0;
-
- for (i = -32; i < 32; i++)
- {
- double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
- int a = (int)(g + .5);
-
- if (a)
- {
- for (j = 0; j < a; j++)
- {
- char_dist[next+j] = (char) i;
- }
-
- next = next + j;
- }
-
- }
-
- for (next = next; next < 256; next++)
- char_dist[next] = 0;
-
- }
-
- for (i = 0; i < 2048; i++)
- {
- Rand[i] = char_dist[rand() & 0xff];
- }
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -char_dist[0];
- whiteclamp[i] = -char_dist[0];
- bothclamp[i] = -2 * char_dist[0];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = Rand + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movdqu xmm1, [esi+eax] // get the source
-
- psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb xmm1, bothclamp
- psubusb xmm1, whiteclamp
-
- movdqu xmm2, [edi+eax] // get the noise for this line
- paddb xmm1, xmm2 // add it in
- movdqu [esi+eax], xmm1 // store the result
-
- add eax, 16 // move to the next line
-
- cmp eax, ecx
- jl nextset
-
-
- }
-
- }
-}
-#endif
-__declspec(align(16))
-static const int four8s[4] = { 8, 8, 8, 8};
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
-{
- int r, i;
- __declspec(align(16))
- int flimit4[4];
- unsigned char *s = src;
- int sumsq;
- int sum;
-
-
- flimit = vp8_q2mbl(flimit);
- flimit4[0] =
- flimit4[1] =
- flimit4[2] =
- flimit4[3] = flimit;
-
- for (r = 0; r < rows; r++)
- {
-
-
- sumsq = 0;
- sum = 0;
-
- for (i = -8; i <= 6; i++)
- {
- sumsq += s[i] * s[i];
- sum += s[i];
- }
-
- __asm
- {
- mov eax, sumsq
- movd xmm7, eax
-
- mov eax, sum
- movd xmm6, eax
-
- mov esi, s
- xor ecx, ecx
-
- mov edx, cols
- add edx, 8
- pxor mm0, mm0
- pxor mm1, mm1
-
- pxor xmm0, xmm0
- nextcol4:
-
- movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5
- movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10
-
- punpcklbw xmm1, xmm0 // expanding
- punpcklbw xmm2, xmm0 // expanding
-
- punpcklwd xmm1, xmm0 // expanding to dwords
- punpcklwd xmm2, xmm0 // expanding to dwords
-
- psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5
- paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2
-
- paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5
- pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5
-
- paddd xmm6, xmm2
- paddd xmm7, xmm1
-
- pshufd xmm6, xmm6, 0 // duplicate the last ones
- pshufd xmm7, xmm7, 0 // duplicate the last ones
-
- psrldq xmm1, 4 // 8--7 9--6 10--5 0000
- psrldq xmm2, 4 // 8--7 9--6 10--5 0000
-
- pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared
- pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared
-
- paddd xmm6, xmm4
- paddd xmm7, xmm3
-
- pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared
- pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared
- pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- movdqa xmm3, xmm6
- pmaddwd xmm3, xmm3
-
- movdqa xmm5, xmm7
- pslld xmm5, 4
-
- psubd xmm5, xmm7
- psubd xmm5, xmm3
-
- psubd xmm5, flimit4
- psrad xmm5, 31
-
- packssdw xmm5, xmm0
- packsswb xmm5, xmm0
-
- movd xmm1, DWORD PTR [esi+ecx]
- movq xmm2, xmm1
-
- punpcklbw xmm1, xmm0
- punpcklwd xmm1, xmm0
-
- paddd xmm1, xmm6
- paddd xmm1, four8s
-
- psrad xmm1, 4
- packssdw xmm1, xmm0
-
- packuswb xmm1, xmm0
- pand xmm1, xmm5
-
- pandn xmm5, xmm2
- por xmm5, xmm1
-
- movd [esi+ecx-8], mm0
- movq mm0, mm1
-
- movdq2q mm1, xmm5
- psrldq xmm7, 12
-
- psrldq xmm6, 12
- add ecx, 4
-
- cmp ecx, edx
- jl nextcol4
-
- }
- s += pitch;
- }
-}
-
-#if 0
-
-/****************************************************************************
- *
- * ROUTINE : plane_add_noise_mmx
- *
- * INPUTS : unsigned char *Start starting address of buffer to add gaussian
- * noise to
- * unsigned int Width width of plane
- * unsigned int Height height of plane
- * int Pitch distance between subsequent lines of frame
- * int q quantizer used to determine amount of noise
- * to add
- *
- * OUTPUTS : None.
- *
- * RETURNS : void.
- *
- * FUNCTION : adds gaussian noise to a plane of pixels
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
- int Pitch4 = Pitch * 4;
- const int noise_amount = 2;
- const int noise_adder = 2 * noise_amount + 1;
-
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
-
- char char_dist[300];
- char Rand[2048];
-
- double sigma;
- __asm emms
- sigma = a + .5 + .6 * (63 - q) / 63.0;
-
- // set up a lookup table of 256 entries that matches
- // a gaussian distribution with sigma determined by q.
- //
- {
- double i, sum = 0;
- int next, j;
-
- next = 0;
-
- for (i = -32; i < 32; i++)
- {
- int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
-
- if (a)
- {
- for (j = 0; j < a; j++)
- {
- char_dist[next+j] = (char) i;
- }
-
- next = next + j;
- }
-
- }
-
- for (next = next; next < 256; next++)
- char_dist[next] = 0;
-
- }
-
- for (i = 0; i < 2048; i++)
- {
- Rand[i] = char_dist[rand() & 0xff];
- }
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -char_dist[0];
- whiteclamp[i] = -char_dist[0];
- bothclamp[i] = -2 * char_dist[0];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = Rand + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movq mm1, [esi+eax] // get the source
-
- psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb mm1, bothclamp
- psubusb mm1, whiteclamp
-
- movq mm2, [edi+eax] // get the noise for this line
- paddb mm1, mm2 // add it in
- movq [esi+eax], mm1 // store the result
-
- add eax, 8 // move to the next line
-
- cmp eax, ecx
- jl nextset
-
-
- }
-
- }
-}
-#else
-extern char an[8][64][3072];
-extern int cd[8][64];
-
-void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
-
-
- __asm emms
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -cd[a][q];
- whiteclamp[i] = -cd[a][q];
- bothclamp[i] = -2 * cd[a][q];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = an[a][q] + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movq mm1, [esi+eax] // get the source
-
- psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb mm1, bothclamp
- psubusb mm1, whiteclamp
-
- movq mm2, [edi+eax] // get the noise for this line
- paddb mm1, mm2 // add it in
- movq [esi+eax], mm1 // store the result
-
- add eax, 8 // move to the next line
-
- cmp eax, ecx
- jl nextset
- }
- }
-}
-
-
-void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
- unsigned int i;
-
- __declspec(align(16)) unsigned char blackclamp[16];
- __declspec(align(16)) unsigned char whiteclamp[16];
- __declspec(align(16)) unsigned char bothclamp[16];
-
- __asm emms
-
- for (i = 0; i < 16; i++)
- {
- blackclamp[i] = -cd[a][q];
- whiteclamp[i] = -cd[a][q];
- bothclamp[i] = -2 * cd[a][q];
- }
-
- for (i = 0; i < Height; i++)
- {
- unsigned char *Pos = Start + i * Pitch;
- char *Ref = an[a][q] + (rand() & 0xff);
-
- __asm
- {
- mov ecx, [Width]
- mov esi, Pos
- mov edi, Ref
- xor eax, eax
-
- nextset:
- movdqu xmm1, [esi+eax] // get the source
-
- psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
- paddusb xmm1, bothclamp
- psubusb xmm1, whiteclamp
-
- movdqu xmm2, [edi+eax] // get the noise for this line
- paddb xmm1, xmm2 // add it in
- movdqu [esi+eax], xmm1 // store the result
-
- add eax, 16 // move to the next line
-
- cmp eax, ecx
- jl nextset
- }
- }
-}
-
-#endif
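
[Context, not part of the patch] The deleted postproc_mmx.c held 32-bit MSVC-only __asm copies of the post-processing routines; equivalent yasm implementations appear to live in postproc_mmx.asm and postproc_sse2.asm (touched by the neighbouring hunks), so these entry points would now come only from the .asm files. For reference, the C-level signatures removed here, copied from the deleted file:

    void vp8_post_proc_down_and_across_mmx(unsigned char *src_ptr, unsigned char *dst_ptr,
                                           int src_pixels_per_line, int dst_pixels_per_line,
                                           int rows, int cols, int flimit);
    void vp8_post_proc_down_and_across_xmm(unsigned char *src_ptr, unsigned char *dst_ptr,
                                           int src_pixels_per_line, int dst_pixels_per_line,
                                           int rows, int cols, int flimit);
    void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit);
    void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit);
    void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit);
    void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height,
                                 int Pitch, int q, int a);
    void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height,
                                 int Pitch, int q, int a);
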
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 06d51ec..1f219ca 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -57,10 +57,10 @@
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor xmm0, xmm0 ; mm0 = 00000000
-nextrow:
+.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
@@ -133,7 +133,7 @@
add rdx, 8
cmp edx, dword arg(5) ;cols
- jl nextcol
+ jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
@@ -142,7 +142,7 @@
xor rdx, rdx
movq mm0, QWORD PTR [rdi-8];
-acrossnextcol:
+.acrossnextcol:
movq xmm7, QWORD PTR [rdi +rdx -2]
movd xmm4, DWORD PTR [rdi +rdx +6]
@@ -219,7 +219,7 @@
add rdx, 8
cmp edx, dword arg(5) ;cols
- jl acrossnextcol;
+ jl .acrossnextcol;
; last 8 pixels
movq QWORD PTR [rdi+rdx-8], mm0
@@ -231,7 +231,7 @@
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
- jnz nextrow ; next row
+ jnz .nextrow ; next row
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
add rsp,16
@@ -282,7 +282,7 @@
add dword arg(2), 8
;for(c=0; c<cols; c+=8)
-loop_col:
+.loop_col:
mov rsi, arg(0) ; s
pxor xmm0, xmm0 ;
@@ -301,7 +301,7 @@
mov rcx, 15 ;
-loop_initvar:
+.loop_initvar:
movq xmm1, QWORD PTR [rdi];
punpcklbw xmm1, xmm0 ;
@@ -318,10 +318,10 @@
lea rdi, [rdi+rax] ;
dec rcx
- jne loop_initvar
+ jne .loop_initvar
;save the var and sum
xor rdx, rdx
-loop_row:
+.loop_row:
movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
@@ -428,12 +428,12 @@
add rdx, 1
cmp edx, dword arg(2) ;rows
- jl loop_row
+ jl .loop_row
add dword arg(0), 8 ; s += 8
sub dword arg(3), 8 ; cols -= 8
cmp dword arg(3), 0
- jg loop_col
+ jg .loop_col
add rsp, 128+16
pop rsp
@@ -475,13 +475,13 @@
;for(r=0;r<rows;r++)
-ip_row_loop:
+.ip_row_loop:
xor rdx, rdx ;sumsq=0;
xor rcx, rcx ;sum=0;
mov rsi, arg(0); s
mov rdi, -8
-ip_var_loop:
+.ip_var_loop:
;for(i=-8;i<=6;i++)
;{
; sumsq += s[i]*s[i];
@@ -493,7 +493,7 @@
add edx, eax
add rdi, 1
cmp rdi, 6
- jle ip_var_loop
+ jle .ip_var_loop
;mov rax, sumsq
@@ -513,7 +513,7 @@
pxor mm1, mm1
pxor xmm0, xmm0
-nextcol4:
+.nextcol4:
movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
@@ -600,7 +600,7 @@
add rcx, 4
cmp rcx, rdx
- jl nextcol4
+ jl .nextcol4
;s+=pitch;
movsxd rax, dword arg(1)
@@ -608,7 +608,7 @@
sub dword arg(2), 1 ;rows-=1
cmp dword arg(2), 0
- jg ip_row_loop
+ jg .ip_row_loop
add rsp, 16
pop rsp
@@ -640,7 +640,7 @@
push rdi
; end prolog
-addnoise_loop:
+.addnoise_loop:
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
@@ -657,7 +657,7 @@
mov rsi, arg(0) ;Pos
xor rax,rax
-addnoise_nextset:
+.addnoise_nextset:
movdqu xmm1,[rsi+rax] ; get the source
psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
@@ -671,12 +671,12 @@
add rax,16 ; move to the next line
cmp rax, rcx
- jl addnoise_nextset
+ jl .addnoise_nextset
movsxd rax, dword arg(7) ; Pitch
add arg(0), rax ; Start += Pitch
sub dword arg(6), 1 ; Height -= 1
- jg addnoise_loop
+ jg .addnoise_loop
; begin epilog
pop rdi
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 0e23116..f54cc4e 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -503,7 +503,7 @@
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
-vp8_intra_pred_uv_tm_%1_loop:
+.vp8_intra_pred_uv_tm_%1_loop:
movd xmm3, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
@@ -525,7 +525,7 @@
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
- jnz vp8_intra_pred_uv_tm_%1_loop
+ jnz .vp8_intra_pred_uv_tm_%1_loop
; begin epilog
pop rdi
@@ -615,7 +615,7 @@
%endif
dec rsi
%ifidn %1, mmx2
-vp8_intra_pred_uv_ho_%1_loop:
+.vp8_intra_pred_uv_ho_%1_loop:
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
@@ -627,7 +627,7 @@
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
- jnz vp8_intra_pred_uv_ho_%1_loop
+ jnz .vp8_intra_pred_uv_ho_%1_loop
%else
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 9004b52..e68d950 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -50,7 +50,7 @@
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
-nextrow:
+.nextrow:
movq mm3, [rsi-2] ; mm3 = p-2..p5
movq mm4, mm3 ; mm4 = p-2..p5
psrlq mm3, 8 ; mm3 = p-1..p5
@@ -102,7 +102,7 @@
%endif
dec rcx ; decrement count
- jnz nextrow ; next row
+ jnz .nextrow ; next row
; begin epilog
pop rdi
@@ -152,7 +152,7 @@
pxor mm0, mm0 ; mm0 = 00000000
-nextrow_cv:
+.nextrow_cv:
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
@@ -190,7 +190,7 @@
; avoidable!!!.
lea rdi, [rdi+rax] ;
dec rcx ; decrement count
- jnz nextrow_cv ; next row
+ jnz .nextrow_cv ; next row
pop rbx
@@ -282,7 +282,7 @@
packuswb mm7, mm4 ;
add rsi, rdx ; next line
-next_row_8x8:
+.next_row_8x8:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
@@ -349,7 +349,7 @@
add rdi, r8 ;dst_pitch
%endif
cmp rdi, rcx ;
- jne next_row_8x8
+ jne .next_row_8x8
; begin epilog
pop rdi
@@ -437,7 +437,7 @@
packuswb mm7, mm4 ;
add rsi, rdx ; next line
-next_row_8x4:
+.next_row_8x4:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
@@ -504,7 +504,7 @@
add rdi, r8
%endif
cmp rdi, rcx ;
- jne next_row_8x4
+ jne .next_row_8x4
; begin epilog
pop rdi
@@ -579,7 +579,7 @@
packuswb mm7, mm0 ;
add rsi, rdx ; next line
-next_row_4x4:
+.next_row_4x4:
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
@@ -622,7 +622,7 @@
%endif
cmp rdi, rcx ;
- jne next_row_4x4
+ jne .next_row_4x4
; begin epilog
pop rdi
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 83e3b14..b62b5c6 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -55,7 +55,7 @@
%endif
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d8_h6_rowloop:
+.filter_block1d8_h6_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -124,7 +124,7 @@
%endif
dec rcx
- jnz filter_block1d8_h6_rowloop ; next row
+ jnz .filter_block1d8_h6_rowloop ; next row
; begin epilog
pop rdi
@@ -176,7 +176,7 @@
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d16_h6_sse2_rowloop:
+.filter_block1d16_h6_sse2_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -301,7 +301,7 @@
%endif
dec rcx
- jnz filter_block1d16_h6_sse2_rowloop ; next row
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
; begin epilog
pop rdi
@@ -356,7 +356,7 @@
movsxd r8, dword ptr arg(2) ; dst_ptich
%endif
-vp8_filter_block1d8_v6_sse2_loop:
+.vp8_filter_block1d8_v6_sse2_loop:
movdqa xmm1, XMMWORD PTR [rsi]
pmullw xmm1, [rax]
@@ -396,7 +396,7 @@
add rdi, r8
%endif
dec rcx ; decrement count
- jnz vp8_filter_block1d8_v6_sse2_loop ; next row
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
; begin epilog
pop rdi
@@ -448,7 +448,7 @@
movsxd r8, dword ptr arg(2) ; dst_ptich
%endif
-vp8_filter_block1d16_v6_sse2_loop:
+.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
@@ -511,7 +511,7 @@
add rdi, r8
%endif
dec rcx ; decrement count
- jnz vp8_filter_block1d16_v6_sse2_loop ; next row
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
; begin epilog
pop rdi
@@ -556,7 +556,7 @@
%endif
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d8_h6_only_rowloop:
+.filter_block1d8_h6_only_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -624,7 +624,7 @@
%endif
dec rcx
- jnz filter_block1d8_h6_only_rowloop ; next row
+ jnz .filter_block1d8_h6_only_rowloop ; next row
; begin epilog
pop rdi
@@ -670,7 +670,7 @@
pxor xmm0, xmm0 ; clear xmm0 for unpack
-filter_block1d16_h6_only_sse2_rowloop:
+.filter_block1d16_h6_only_sse2_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
@@ -789,7 +789,7 @@
%endif
dec rcx
- jnz filter_block1d16_h6_only_sse2_rowloop ; next row
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
; begin epilog
pop rdi
@@ -837,7 +837,7 @@
movsxd r8, dword ptr arg(3) ; dst_ptich
%endif
-vp8_filter_block1d8_v6_only_sse2_loop:
+.vp8_filter_block1d8_v6_only_sse2_loop:
movq xmm1, MMWORD PTR [rsi]
movq xmm2, MMWORD PTR [rsi + rdx]
movq xmm3, MMWORD PTR [rsi + rdx * 2]
@@ -883,7 +883,7 @@
add rdi, r8
%endif
dec rcx ; decrement count
- jnz vp8_filter_block1d8_v6_only_sse2_loop ; next row
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
; begin epilog
pop rdi
@@ -924,7 +924,7 @@
movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
%endif
-unpack_block1d16_h6_sse2_rowloop:
+.unpack_block1d16_h6_sse2_rowloop:
movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1
@@ -941,7 +941,7 @@
add rdi, r8
%endif
dec rcx
- jnz unpack_block1d16_h6_sse2_rowloop ; next row
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
; begin epilog
pop rdi
@@ -980,7 +980,7 @@
movsxd rax, dword ptr arg(2) ;xoffset
cmp rax, 0 ;skip first_pass filter if xoffset=0
- je b16x16_sp_only
+ je .b16x16_sp_only
shl rax, 5
add rax, rcx ;HFilter
@@ -995,7 +995,7 @@
movsxd rax, dword ptr arg(3) ;yoffset
cmp rax, 0 ;skip second_pass filter if yoffset=0
- je b16x16_fp_only
+ je .b16x16_fp_only
shl rax, 5
add rax, rcx ;VFilter
@@ -1041,7 +1041,7 @@
packuswb xmm7, xmm4
add rsi, rdx ; next line
-next_row:
+.next_row:
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
@@ -1104,11 +1104,11 @@
%endif
cmp rdi, rcx
- jne next_row
+ jne .next_row
- jmp done
+ jmp .done
-b16x16_sp_only:
+.b16x16_sp_only:
movsxd rax, dword ptr arg(3) ;yoffset
shl rax, 5
add rax, rcx ;VFilter
@@ -1130,7 +1130,7 @@
movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
add rsi, rax ; next line
-next_row_spo:
+.next_row_spo:
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm5, xmm7
@@ -1164,17 +1164,17 @@
add rsi, rax ; next line
add rdi, rdx ;dst_pitch
cmp rdi, rcx
- jne next_row_spo
+ jne .next_row_spo
- jmp done
+ jmp .done
-b16x16_fp_only:
+.b16x16_fp_only:
lea rcx, [rdi+rdx*8]
lea rcx, [rcx+rdx*8]
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
pxor xmm0, xmm0
-next_row_fpo:
+.next_row_fpo:
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movdqa xmm4, xmm3 ; make a copy of current line
@@ -1208,9 +1208,9 @@
add rsi, rax ; next line
add rdi, rdx ; dst_pitch
cmp rdi, rcx
- jne next_row_fpo
+ jne .next_row_fpo
-done:
+.done:
; begin epilog
pop rdi
pop rsi
@@ -1318,7 +1318,7 @@
movdqa xmm7, xmm3
add rsp, 16 ; next line
-next_row8x8:
+.next_row8x8:
movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
movdqa xmm4, xmm3 ; make a copy of current line
psrldq xmm4, 1
@@ -1352,7 +1352,7 @@
add rdi, rdx
cmp rdi, rcx
- jne next_row8x8
+ jne .next_row8x8
;add rsp, 144
pop rsp
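A note on the assembly hunks above and below: the only change is that jump targets gain a leading dot, which makes them NASM local labels scoped to the most recent non-local label (the sym(...) entry point of each routine). The usual motivations are letting routines in the same file reuse generic names such as .done or .next_row without colliding, and keeping these one-off jump targets from cluttering the symbol table. Local labels still share their routine's namespace, which is why loops living inside a single routine get distinct suffixes further down (.next_row_sp, .next_row_fp in subpixel_ssse3.asm). A rough C analogy, with hypothetical functions that are not part of the patch: goto labels are already scoped per function, so every routine can carry its own done: label.

    #include <stddef.h>

    int sum_u8(const unsigned char *p, size_t n)
    {
        int s = 0;
        if (!p) goto done;
        for (size_t i = 0; i < n; ++i)
            s += p[i];
    done:
        return s;
    }

    int max_u8(const unsigned char *p, size_t n)
    {
        int m = 0;
        if (!p) goto done;              /* a second, independent "done" label */
        for (size_t i = 0; i < n; ++i)
            if (p[i] > m) m = p[i];
    done:
        return m;
    }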
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 1ddbc54..6bca82b 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -70,7 +70,7 @@
sub rdi, rdx
;xmm3 free
-filter_block1d8_h6_rowloop_ssse3:
+.filter_block1d8_h6_rowloop_ssse3:
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
@@ -102,7 +102,7 @@
packuswb xmm0, xmm0
movq MMWORD Ptr [rdi], xmm0
- jnz filter_block1d8_h6_rowloop_ssse3
+ jnz .filter_block1d8_h6_rowloop_ssse3
; begin epilog
pop rdi
@@ -129,7 +129,7 @@
sub rdi, rdx
-filter_block1d8_h4_rowloop_ssse3:
+.filter_block1d8_h4_rowloop_ssse3:
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
@@ -158,7 +158,7 @@
movq MMWORD Ptr [rdi], xmm0
- jnz filter_block1d8_h4_rowloop_ssse3
+ jnz .filter_block1d8_h4_rowloop_ssse3
; begin epilog
pop rdi
@@ -207,7 +207,7 @@
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rdx, dword ptr arg(3) ;output_pitch
-filter_block1d16_h6_rowloop_ssse3:
+.filter_block1d16_h6_rowloop_ssse3:
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
@@ -264,7 +264,7 @@
lea rdi, [rdi + rdx]
dec rcx
- jnz filter_block1d16_h6_rowloop_ssse3
+ jnz .filter_block1d16_h6_rowloop_ssse3
; begin epilog
pop rdi
@@ -304,7 +304,7 @@
movdqa xmm7, [GLOBAL(rd)]
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d4_h4_ssse3
+ je .vp8_filter_block1d4_h4_ssse3
movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
@@ -318,7 +318,7 @@
movsxd rdx, dword ptr arg(3) ;output_pitch
;xmm3 free
-filter_block1d4_h6_rowloop_ssse3:
+.filter_block1d4_h6_rowloop_ssse3:
movdqu xmm0, XMMWORD PTR [rsi - 2]
movdqa xmm1, xmm0
@@ -346,7 +346,7 @@
add rdi, rdx
dec rcx
- jnz filter_block1d4_h6_rowloop_ssse3
+ jnz .filter_block1d4_h6_rowloop_ssse3
; begin epilog
pop rdi
@@ -356,7 +356,7 @@
pop rbp
ret
-vp8_filter_block1d4_h4_ssse3:
+.vp8_filter_block1d4_h4_ssse3:
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
@@ -369,7 +369,7 @@
movsxd rdx, dword ptr arg(3) ;output_pitch
-filter_block1d4_h4_rowloop_ssse3:
+.filter_block1d4_h4_rowloop_ssse3:
movdqu xmm1, XMMWORD PTR [rsi - 2]
movdqa xmm2, xmm1
@@ -391,7 +391,7 @@
add rdi, rdx
dec rcx
- jnz filter_block1d4_h4_rowloop_ssse3
+ jnz .filter_block1d4_h4_rowloop_ssse3
; begin epilog
pop rdi
@@ -432,7 +432,7 @@
add rax, rdx
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d16_v4_ssse3
+ je .vp8_filter_block1d16_v4_ssse3
movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
@@ -450,7 +450,7 @@
add rax, rdx
-vp8_filter_block1d16_v6_ssse3_loop:
+.vp8_filter_block1d16_v6_ssse3_loop:
movq xmm1, MMWORD PTR [rsi] ;A
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
@@ -508,7 +508,7 @@
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d16_v6_ssse3_loop
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
; begin epilog
pop rdi
@@ -519,7 +519,7 @@
pop rbp
ret
-vp8_filter_block1d16_v4_ssse3:
+.vp8_filter_block1d16_v4_ssse3:
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
@@ -534,7 +534,7 @@
movsxd rcx, DWORD PTR arg(4) ;output_height
add rax, rdx
-vp8_filter_block1d16_v4_ssse3_loop:
+.vp8_filter_block1d16_v4_ssse3_loop:
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
@@ -581,7 +581,7 @@
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d16_v4_ssse3_loop
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
; begin epilog
pop rdi
@@ -627,7 +627,7 @@
movsxd rcx, DWORD PTR arg(4) ;[output_height]
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d8_v4_ssse3
+ je .vp8_filter_block1d8_v4_ssse3
movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
@@ -638,7 +638,7 @@
mov rax, rsi
add rax, rdx
-vp8_filter_block1d8_v6_ssse3_loop:
+.vp8_filter_block1d8_v6_ssse3_loop:
movq xmm1, MMWORD PTR [rsi] ;A
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
@@ -673,7 +673,7 @@
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d8_v6_ssse3_loop
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
; begin epilog
pop rdi
@@ -684,7 +684,7 @@
pop rbp
ret
-vp8_filter_block1d8_v4_ssse3:
+.vp8_filter_block1d8_v4_ssse3:
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
movdqa xmm5, [GLOBAL(rd)]
@@ -694,7 +694,7 @@
mov rax, rsi
add rax, rdx
-vp8_filter_block1d8_v4_ssse3_loop:
+.vp8_filter_block1d8_v4_ssse3_loop:
movq xmm2, MMWORD PTR [rsi + rdx] ;B
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
@@ -722,7 +722,7 @@
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d8_v4_ssse3_loop
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
; begin epilog
pop rdi
@@ -766,7 +766,7 @@
movsxd rcx, DWORD PTR arg(4) ;[output_height]
cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d4_v4_ssse3
+ je .vp8_filter_block1d4_v4_ssse3
movq mm5, MMWORD PTR [rax] ;k0_k5
movq mm6, MMWORD PTR [rax+256] ;k2_k4
@@ -777,7 +777,7 @@
mov rax, rsi
add rax, rdx
-vp8_filter_block1d4_v6_ssse3_loop:
+.vp8_filter_block1d4_v6_ssse3_loop:
movd mm1, DWORD PTR [rsi] ;A
movd mm2, DWORD PTR [rsi + rdx] ;B
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
@@ -813,7 +813,7 @@
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d4_v6_ssse3_loop
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
; begin epilog
pop rdi
@@ -823,7 +823,7 @@
pop rbp
ret
-vp8_filter_block1d4_v4_ssse3:
+.vp8_filter_block1d4_v4_ssse3:
movq mm6, MMWORD PTR [rax+256] ;k2_k4
movq mm7, MMWORD PTR [rax+128] ;k1_k3
movq mm5, MMWORD PTR [GLOBAL(rd)]
@@ -833,7 +833,7 @@
mov rax, rsi
add rax, rdx
-vp8_filter_block1d4_v4_ssse3_loop:
+.vp8_filter_block1d4_v4_ssse3_loop:
movd mm2, DWORD PTR [rsi + rdx] ;B
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
movd mm4, DWORD PTR [rax + rdx * 2] ;D
@@ -861,7 +861,7 @@
add rdi, r8
%endif
dec rcx
- jnz vp8_filter_block1d4_v4_ssse3_loop
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
; begin epilog
pop rdi
@@ -895,7 +895,7 @@
movsxd rax, dword ptr arg(2) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je b16x16_sp_only
+ je .b16x16_sp_only
shl rax, 4
lea rax, [rax + rcx] ; HFilter
@@ -909,7 +909,7 @@
movsxd rax, dword ptr arg(3) ; yoffset
cmp rax, 0 ; skip second_pass filter if yoffset=0
- je b16x16_fp_only
+ je .b16x16_fp_only
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -996,9 +996,9 @@
cmp rdi, rcx
jne .next_row
- jmp done
+ jmp .done
-b16x16_sp_only:
+.b16x16_sp_only:
movsxd rax, dword ptr arg(3) ; yoffset
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -1018,7 +1018,7 @@
movq xmm2, [rsi + 8] ; load row 0
lea rsi, [rsi + rax] ; next line
-.next_row:
+.next_row_sp:
movq xmm3, [rsi] ; load row + 1
movq xmm5, [rsi + 8] ; load row + 1
@@ -1062,16 +1062,16 @@
lea rdi, [rdi + 2*rdx]
cmp rdi, rcx
- jne .next_row
+ jne .next_row_sp
- jmp done
+ jmp .done
-b16x16_fp_only:
+.b16x16_fp_only:
lea rcx, [rdi+rdx*8]
lea rcx, [rcx+rdx*8]
movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-.next_row:
+.next_row_fp:
movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
@@ -1122,9 +1122,9 @@
cmp rdi, rcx
- jne .next_row
+ jne .next_row_fp
-done:
+.done:
; begin epilog
pop rdi
pop rsi
@@ -1191,7 +1191,7 @@
movsxd rax, dword ptr arg(2) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je b8x8_sp_only
+ je .b8x8_sp_only
shl rax, 4
add rax, rcx ; HFilter
@@ -1203,7 +1203,7 @@
movsxd rax, dword ptr arg(3) ; yoffset
cmp rax, 0 ; skip second_pass filter if yoffset=0
- je b8x8_fp_only
+ je .b8x8_fp_only
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -1260,9 +1260,9 @@
cmp rdi, rcx
jne .next_row
- jmp done8x8
+ jmp .done8x8
-b8x8_sp_only:
+.b8x8_sp_only:
movsxd rax, dword ptr arg(3) ; yoffset
shl rax, 4
lea rax, [rax + rcx] ; VFilter
@@ -1364,12 +1364,12 @@
movq [rdi+rdx], xmm1
lea rsp, [rsp + 144]
- jmp done8x8
+ jmp .done8x8
-b8x8_fp_only:
+.b8x8_fp_only:
lea rcx, [rdi+rdx*8]
-.next_row:
+.next_row_fp:
movdqa xmm1, XMMWORD PTR [rsp]
movdqa xmm3, XMMWORD PTR [rsp+16]
@@ -1430,11 +1430,11 @@
lea rdi, [rdi + 2*rdx]
cmp rdi, rcx
- jne .next_row
+ jne .next_row_fp
lea rsp, [rsp + 16]
-done8x8:
+.done8x8:
;add rsp, 144
pop rsp
; begin epilog
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
index 4c88db4..3a48068 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -12,17 +12,17 @@
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
-void idct_dequant_dc_0_2x_sse2
+void vp8_idct_dequant_dc_0_2x_sse2
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int dst_stride, short *dc);
-void idct_dequant_dc_full_2x_sse2
+void vp8_idct_dequant_dc_full_2x_sse2
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int dst_stride, short *dc);
-void idct_dequant_0_2x_sse2
+void vp8_idct_dequant_0_2x_sse2
(short *q, short *dq ,unsigned char *pre,
unsigned char *dst, int dst_stride, int blk_stride);
-void idct_dequant_full_2x_sse2
+void vp8_idct_dequant_full_2x_sse2
(short *q, short *dq ,unsigned char *pre,
unsigned char *dst, int dst_stride, int blk_stride);
@@ -35,14 +35,14 @@
for (i = 0; i < 4; i++)
{
if (((short *)(eobs))[0] & 0xfefe)
- idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
+ vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
else
- idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
+ vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
if (((short *)(eobs))[1] & 0xfefe)
- idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
+ vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
else
- idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
+ vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
q += 64;
dc += 4;
@@ -61,14 +61,14 @@
for (i = 0; i < 4; i++)
{
if (((short *)(eobs))[0] & 0xfefe)
- idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
+ vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
else
- idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
+ vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
if (((short *)(eobs))[1] & 0xfefe)
- idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
+ vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
else
- idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
+ vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
q += 64;
pre += 64;
@@ -82,33 +82,33 @@
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
if (((short *)(eobs))[0] & 0xfefe)
- idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
+ vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
else
- idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
+ vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
q += 32;
pre += 32;
dstu += stride*4;
if (((short *)(eobs))[1] & 0xfefe)
- idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
+ vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
else
- idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
+ vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
q += 32;
pre += 32;
if (((short *)(eobs))[2] & 0xfefe)
- idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
+ vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
else
- idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+ vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
q += 32;
pre += 32;
dstv += stride*4;
if (((short *)(eobs))[3] & 0xfefe)
- idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
+ vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
else
- idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+ vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
}
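The idct_blk_sse2.c hunks above only rename the four assembly helpers so that the exported symbols carry the vp8_ prefix; the selection logic is untouched. For reference, a minimal sketch of that per-pair dispatch, assuming (as the mask suggests) that eobs[] holds one end-of-block count per 4x4 block, so a pair read as a short and masked with 0xfefe is non-zero exactly when at least one block codes more than its DC coefficient:

    /* Prototypes as declared in idct_blk_sse2.c above. */
    void vp8_idct_dequant_0_2x_sse2
        (short *q, short *dq, unsigned char *pre,
         unsigned char *dst, int dst_stride, int blk_stride);
    void vp8_idct_dequant_full_2x_sse2
        (short *q, short *dq, unsigned char *pre,
         unsigned char *dst, int dst_stride, int blk_stride);

    /* Hypothetical helper (not in the patch): handle one pair of 4x4 blocks. */
    static void dequant_idct_pair(short *q, short *dq, unsigned char *pre,
                                  unsigned char *dst, int stride, char *eobs)
    {
        if (((short *)eobs)[0] & 0xfefe)   /* some AC coefficients present */
            vp8_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
        else                               /* both blocks DC-only or empty */
            vp8_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
    }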
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index b3c2439..cea8e12 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -22,7 +22,8 @@
#include "vpx/vpx_encoder.h"
#include "vpx_mem/vpx_mem.h"
#include "bitstream.h"
-#include "vp8/common/defaultcoefcounts.h"
+
+#include "defaultcoefcounts.h"
const int vp8cx_base_skip_false_prob[128] =
{
@@ -1199,7 +1200,7 @@
if (cpi->common.frame_type == KEY_FRAME)
{
/* Reset to default probabilities at key frames */
- sum_probs_over_prev_coef_context(vp8_default_coef_counts[i][j],
+ sum_probs_over_prev_coef_context(default_coef_counts[i][j],
prev_coef_count_sum);
}
else
diff --git a/vp8/common/defaultcoefcounts.c b/vp8/encoder/defaultcoefcounts.h
similarity index 96%
rename from vp8/common/defaultcoefcounts.c
rename to vp8/encoder/defaultcoefcounts.h
index b0e2e70..2c0f3dd 100644
--- a/vp8/common/defaultcoefcounts.c
+++ b/vp8/encoder/defaultcoefcounts.h
@@ -8,14 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "defaultcoefcounts.h"
-
/* Generated file, included by entropy.c */
-const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
- [COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS] =
+static const unsigned int default_coef_counts[BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS] =
{
{
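This rename moves the default coefficient-count table from an extern definition in vp8/common to a static table in an encoder-only header, matching the bitstream.c include change above: the decoder no longer carries the data, and since the definition is static, only translation units that include the header (currently just bitstream.c) get a copy. A sketch of the resulting layout, with the table data elided:

    /* vp8/encoder/defaultcoefcounts.h */
    static const unsigned int default_coef_counts[BLOCK_TYPES]
                                                  [COEF_BANDS]
                                                  [PREV_COEF_CONTEXTS]
                                                  [MAX_ENTROPY_TOKENS] =
    {
        /* ... generated table ... */
    };

    /* vp8/encoder/bitstream.c */
    #include "defaultcoefcounts.h"
    /* ...
       sum_probs_over_prev_coef_context(default_coef_counts[i][j],
                                        prev_coef_count_sum);
       ... */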
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 9cdc1e5..8559142 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1473,7 +1473,6 @@
int i;
double boost_score = 0.0;
- double fwd_boost_score = 0.0;
double mv_ratio_accumulator = 0.0;
double decay_accumulator = 1.0;
double this_frame_mv_in_out = 0.0;
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 9906105..a14843a 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -94,16 +94,15 @@
#if !(CONFIG_REALTIME_ONLY)
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c;
#endif
+#if CONFIG_INTERNAL_STATS
+ cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_c;
+ cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_c;
+#endif
#endif
// Pure C:
vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
-#if CONFIG_INTERNAL_STATS
- cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c;
- cpi->rtcd.variance.ssimpf = ssim_parms_c;
-#endif
-
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_encoder_init(cpi);
#endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index ff9a641..35e187e 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3237,16 +3237,17 @@
// Test code for segmentation of gf/arf (0,0)
//segmentation_test_function((VP8_PTR) cpi);
-#if CONFIG_REALTIME_ONLY
- if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
+ if (cpi->compressor_speed == 2)
{
- if(cpi->force_next_frame_intra)
+ if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
{
- cm->frame_type = KEY_FRAME; /* delayed intra frame */
+ if(cpi->force_next_frame_intra)
+ {
+ cm->frame_type = KEY_FRAME; /* delayed intra frame */
+ }
}
+ cpi->force_next_frame_intra = 0;
}
- cpi->force_next_frame_intra = 0;
-#endif
// For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth
#if !(CONFIG_REALTIME_ONLY)
@@ -3775,15 +3776,15 @@
// (assuming that we didn't)!
if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
{
+ int key_frame_decision = decide_key_frame(cpi);
-#if CONFIG_REALTIME_ONLY
+ if (cpi->compressor_speed == 2)
{
/* we don't do re-encoding in realtime mode
* if a key frame is decided then we force it on the next frame */
- cpi->force_next_frame_intra = decide_key_frame(cpi);
+ cpi->force_next_frame_intra = key_frame_decision;
}
-#else
- if (decide_key_frame(cpi))
+ else if (key_frame_decision)
{
// Reset all our sizing numbers and recode
cm->frame_type = KEY_FRAME;
@@ -3820,7 +3821,6 @@
continue;
}
-#endif
}
vp8_clear_system_state();
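Taken together, the onyx_if.c hunks above replace the CONFIG_REALTIME_ONLY compile-time branches with a runtime check on cpi->compressor_speed (2 being the realtime speed setting), so the delayed-intra behaviour now applies to any build that runs at realtime speed rather than only to realtime-only builds. Reassembled from the hunks, the key-frame decision block now reads roughly as follows (a sketch, surrounding code omitted):

    if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
    {
        int key_frame_decision = decide_key_frame(cpi);

        if (cpi->compressor_speed == 2)
        {
            /* realtime: no re-encoding, so if a key frame is decided we
             * force it on the next frame instead */
            cpi->force_next_frame_intra = key_frame_decision;
        }
        else if (key_frame_decision)
        {
            /* other speeds: reset the sizing numbers and recode this
             * frame as a key frame */
            cm->frame_type = KEY_FRAME;
            /* ... */
        }
    }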
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index f75f6cb..aead2fb 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -609,9 +609,8 @@
int *lf_ref_frame_sign_bias;
int *lf_ref_frame;
-#if CONFIG_REALTIME_ONLY
int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
-#endif
+
int droppable;
} VP8_COMP;
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index fea756f..d0f8e49 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -9,18 +9,9 @@
*/
-#include "vpx_scale/yv12config.h"
-#include "math.h"
#include "onyx_int.h"
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-
-void ssim_parms_c
+void vp8_ssim_parms_16x16_c
(
unsigned char *s,
int sp,
@@ -46,7 +37,7 @@
}
}
}
-void ssim_parms_8x8_c
+void vp8_ssim_parms_8x8_c
(
unsigned char *s,
int sp,
@@ -107,14 +98,14 @@
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
- rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
}
static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
const vp8_variance_rtcd_vtable_t *rtcd)
{
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
- rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ SSIMPF_INVOKE(rtcd,8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
}
@@ -134,7 +125,7 @@
c1 = cc1*16;
c2 = cc2*16;
- rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
ssim_n1 = (2*sum_s*sum_r+ c1);
ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5fd6d3a..d9bf669 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -320,16 +320,16 @@
#endif
extern prototype_get16x16prederror(vp8_variance_get4x4sse_cs);
-#ifndef vp8_ssimpf
-#define vp8_ssimpf ssim_parms_c
-#endif
-extern prototype_ssimpf(vp8_ssimpf)
-
#ifndef vp8_ssimpf_8x8
-#define vp8_ssimpf_8x8 ssim_parms_8x8_c
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c
#endif
extern prototype_ssimpf(vp8_ssimpf_8x8)
+#ifndef vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf_16x16)
+
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
@@ -394,7 +394,7 @@
#if CONFIG_INTERNAL_STATS
vp8_ssimpf_fn_t ssimpf_8x8;
- vp8_ssimpf_fn_t ssimpf;
+ vp8_ssimpf_fn_t ssimpf_16x16;
#endif
} vp8_variance_rtcd_vtable_t;
@@ -417,8 +417,10 @@
#if CONFIG_RUNTIME_CPU_DETECT
#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
+#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn
#else
#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
+#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn
#endif
#endif
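SSIMPF_INVOKE follows the same pattern as VARIANCE_INVOKE just above it: with runtime CPU detection the token pasting selects the rtcd vtable member (ssimpf_8x8 or ssimpf_16x16, filled in by the system-dependent init code), and without it the call binds directly to the compile-time default vp8_ssimpf_8x8 / vp8_ssimpf_16x16 from the #defines earlier in this header. A small self-contained illustration of the token-pasting dispatch, using hypothetical names that are not part of the patch:

    #include <stdio.h>

    typedef void (*parms_fn_t)(int width);

    struct rtcd_vtable { parms_fn_t ssimpf_8x8, ssimpf_16x16; };

    static void parms_8x8_c(int width)   { printf("8x8 C kernel, width %d\n", width); }
    static void parms_16x16_c(int width) { printf("16x16 C kernel, width %d\n", width); }

    #define RUNTIME_CPU_DETECT 1
    #if RUNTIME_CPU_DETECT
    #define SSIMPF_INVOKE(ctx, fn) (ctx)->ssimpf_##fn   /* vtable slot, filled at init time */
    #else
    #define SSIMPF_INVOKE(ctx, fn) parms_##fn##_c       /* static binding to the default    */
    #endif

    int main(void)
    {
        struct rtcd_vtable rtcd = { parms_8x8_c, parms_16x16_c };
        SSIMPF_INVOKE(&rtcd, 8x8)(8);     /* expands to (&rtcd)->ssimpf_8x8(8)    */
        SSIMPF_INVOKE(&rtcd, 16x16)(16);  /* expands to (&rtcd)->ssimpf_16x16(16) */
        return 0;
    }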
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 9946294..7ec7d60 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -148,7 +148,7 @@
pcmpeqw mm1, mm7
mov rcx, 16
-mberror_loop_mmx:
+.mberror_loop_mmx:
movq mm3, [rsi]
movq mm4, [rdi]
@@ -186,7 +186,7 @@
add rdi, 32
sub rcx, 1
- jnz mberror_loop_mmx
+ jnz .mberror_loop_mmx
movq mm0, mm2
psrlq mm2, 32
@@ -226,7 +226,7 @@
pcmpeqw xmm5, xmm6
mov rcx, 16
-mberror_loop:
+.mberror_loop:
movdqa xmm0, [rsi]
movdqa xmm1, [rdi]
@@ -249,7 +249,7 @@
paddd xmm4, xmm2
paddd xmm4, xmm0
- jnz mberror_loop
+ jnz .mberror_loop
movdqa xmm0, xmm4
punpckldq xmm0, xmm6
@@ -289,7 +289,7 @@
mov rcx, 16
pxor mm7, mm7
-mbuverror_loop_mmx:
+.mbuverror_loop_mmx:
movq mm1, [rsi]
movq mm2, [rdi]
@@ -313,7 +313,7 @@
add rdi, 16
dec rcx
- jnz mbuverror_loop_mmx
+ jnz .mbuverror_loop_mmx
movq mm0, mm7
psrlq mm7, 32
@@ -346,7 +346,7 @@
mov rcx, 16
pxor xmm3, xmm3
-mbuverror_loop:
+.mbuverror_loop:
movdqa xmm1, [rsi]
movdqa xmm2, [rdi]
@@ -360,7 +360,7 @@
add rdi, 16
dec rcx
- jnz mbuverror_loop
+ jnz .mbuverror_loop
pxor xmm0, xmm0
movdqa xmm1, xmm3
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 056b64c..c483933 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -137,17 +137,17 @@
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl rq_zigzag_loop_%1 ; x < zbin
+ jl .rq_zigzag_loop_%1 ; x < zbin
movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
; downshift by quant_shift[rc]
movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
sar edi, cl ; also sets Z bit
- je rq_zigzag_loop_%1 ; !y
+ je .rq_zigzag_loop_%1 ; !y
mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index 258899e..95e1c20 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -140,21 +140,21 @@
; if (x >= zbin)
sub cx, WORD PTR[rdx] ; x - zbin
lea rdx, [rdx + 2] ; zbin_boost_ptr++
- jl rq_zigzag_loop_%1 ; x < zbin
+ jl .rq_zigzag_loop_%1 ; x < zbin
pextrw edi, %3, %2 ; y
; downshift by quant_shift[rc]
pextrb ecx, xmm5, %1 ; quant_shift[rc]
sar edi, cl ; also sets Z bit
- je rq_zigzag_loop_%1 ; !y
+ je .rq_zigzag_loop_%1 ; !y
%if ABI_IS_32BIT
mov WORD PTR[rsp + qcoeff + %1 *2], di
%else
pinsrw %5, edi, %2 ; qcoeff[rc]
%endif
mov rdx, rax ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index 85cb023..407b399 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -43,7 +43,7 @@
pxor mm6, mm6
-x16x16sad_mmx_loop:
+.x16x16sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
@@ -83,7 +83,7 @@
paddw mm7, mm1
cmp rsi, rcx
- jne x16x16sad_mmx_loop
+ jne .x16x16sad_mmx_loop
movq mm0, mm7
@@ -135,7 +135,7 @@
pxor mm6, mm6
-x8x16sad_mmx_loop:
+.x8x16sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -158,7 +158,7 @@
paddw mm7, mm2
cmp rsi, rcx
- jne x8x16sad_mmx_loop
+ jne .x8x16sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
@@ -205,7 +205,7 @@
pxor mm6, mm6
-x8x8sad_mmx_loop:
+.x8x8sad_mmx_loop:
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -228,7 +228,7 @@
paddw mm7, mm0
cmp rsi, rcx
- jne x8x8sad_mmx_loop
+ jne .x8x8sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
@@ -364,7 +364,7 @@
pxor mm6, mm6
-x16x8sad_mmx_loop:
+.x16x8sad_mmx_loop:
movq mm0, [rsi]
movq mm1, [rdi]
@@ -404,7 +404,7 @@
paddw mm7, mm0
cmp rsi, rcx
- jne x16x8sad_mmx_loop
+ jne .x16x8sad_mmx_loop
movq mm0, mm7
punpcklwd mm0, mm6
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 1011c95..fa8e3e3 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -37,7 +37,7 @@
lea rcx, [rcx+rax*8]
pxor xmm6, xmm6
-x16x16sad_wmt_loop:
+.x16x16sad_wmt_loop:
movq xmm0, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rsi+8]
@@ -68,7 +68,7 @@
paddw xmm6, xmm4
cmp rsi, rcx
- jne x16x16sad_wmt_loop
+ jne .x16x16sad_wmt_loop
movq xmm0, xmm6
psrldq xmm6, 8
@@ -111,11 +111,11 @@
lea rcx, [rcx+rbx*8]
pxor mm7, mm7
-x8x16sad_wmt_loop:
+.x8x16sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x8x16sad_wmt_early_exit
+ jg .x8x16sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -133,11 +133,11 @@
paddw mm7, mm2
cmp rsi, rcx
- jne x8x16sad_wmt_loop
+ jne .x8x16sad_wmt_loop
movq rax, mm7
-x8x16sad_wmt_early_exit:
+.x8x16sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -172,11 +172,11 @@
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
-x8x8sad_wmt_loop:
+.x8x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x8x8sad_wmt_early_exit
+ jg .x8x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
@@ -188,10 +188,10 @@
paddw mm7, mm0
cmp rsi, rcx
- jne x8x8sad_wmt_loop
+ jne .x8x8sad_wmt_loop
movq rax, mm7
-x8x8sad_wmt_early_exit:
+.x8x8sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -281,11 +281,11 @@
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
-x16x8sad_wmt_loop:
+.x16x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
- jg x16x8sad_wmt_early_exit
+ jg .x16x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
@@ -315,11 +315,11 @@
paddw mm7, mm4
cmp rsi, rcx
- jne x16x8sad_wmt_loop
+ jne .x16x8sad_wmt_loop
movq rax, mm7
-x16x8sad_wmt_early_exit:
+.x16x8sad_wmt_early_exit:
; begin epilog
pop rdi
@@ -352,7 +352,7 @@
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
-block_copy_sse2_loopx4:
+.block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
@@ -383,12 +383,12 @@
sub rcx, 4
cmp rcx, 4
- jge block_copy_sse2_loopx4
+ jge .block_copy_sse2_loopx4
cmp rcx, 0
- je copy_is_done
+ je .copy_is_done
-block_copy_sse2_loop:
+.block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
@@ -398,9 +398,9 @@
lea rdi, [rdi+rdx]
sub rcx, 1
- jne block_copy_sse2_loop
+ jne .block_copy_sse2_loop
-copy_is_done:
+.copy_is_done:
; begin epilog
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 9e05521..a255097 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -647,7 +647,7 @@
STACK_FRAME_CREATE_X3
-block_copy_sse3_loopx4:
+.block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
@@ -676,13 +676,13 @@
sub height, 4
cmp height, 4
- jge block_copy_sse3_loopx4
+ jge .block_copy_sse3_loopx4
;Check to see if there are more rows that need to be copied.
cmp height, 0
- je copy_is_done
+ je .copy_is_done
-block_copy_sse3_loop:
+.block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
@@ -692,9 +692,9 @@
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
- jne block_copy_sse3_loop
+ jne .block_copy_sse3_loop
-copy_is_done:
+.copy_is_done:
STACK_FRAME_DESTROY_X3
;void vp8_sad16x16x4d_sse3(
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 6ecf081..95b6c89 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -169,30 +169,30 @@
mov rdx, 0xf
and rdx, rdi
- jmp vp8_sad16x16x3_ssse3_skiptable
-vp8_sad16x16x3_ssse3_jumptable:
- dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
- dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_skiptable:
+ jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+ dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
- call vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_do_jump:
+ call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+ mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
@@ -203,23 +203,23 @@
jmp rcx
- PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
-vp8_sad16x16x3_ssse3_aligned_by_15:
+.vp8_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
@@ -229,7 +229,7 @@
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-vp8_sad16x16x3_ssse3_store_off:
+.vp8_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
@@ -282,30 +282,30 @@
mov rdx, 0xf
and rdx, rdi
- jmp vp8_sad16x8x3_ssse3_skiptable
-vp8_sad16x8x3_ssse3_jumptable:
- dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
- dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_skiptable:
+ jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+ dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
- call vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_do_jump:
+ call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+ mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
@@ -316,30 +316,30 @@
jmp rcx
- PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
-vp8_sad16x8x3_ssse3_aligned_by_15:
+.vp8_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-vp8_sad16x8x3_ssse3_store_off:
+.vp8_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
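For context on the sad_ssse3.asm hunks above: each routine picks one of sixteen alignment-specialised code paths by masking the low four bits of an input pointer (the mov rdx, 0xf / and rdx, rdi pair) and looking the target up in a table of 32-bit label differences. The call into .vp8_sad16x16x3_ssse3_do_jump followed by pop rcx recovers the table's base address at run time, so the table stores only relative offsets and needs no load-time relocation; with the leading dots, every one of those targets is now local to the enclosing function. A loose C analogy for the dispatch-by-alignment idea, with hypothetical names and without the relative-offset trick:

    #include <stdint.h>

    typedef void (*sad_path_fn)(const unsigned char *ref);

    static void sad_aligned_by_0(const unsigned char *ref) { (void)ref; /* aligned loads   */ }
    static void sad_misaligned(const unsigned char *ref)   { (void)ref; /* unaligned paths */ }

    static void sad_dispatch(const unsigned char *ref)
    {
        static const sad_path_fn table[16] = {
            sad_aligned_by_0, sad_misaligned, sad_misaligned, sad_misaligned,
            sad_misaligned,   sad_misaligned, sad_misaligned, sad_misaligned,
            sad_misaligned,   sad_misaligned, sad_misaligned, sad_misaligned,
            sad_misaligned,   sad_misaligned, sad_misaligned, sad_misaligned,
        };
        table[(uintptr_t)ref & 15](ref);  /* same selection as "and rdx, rdi" */
    }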
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index d5d267a..c6db3d1 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -44,7 +44,7 @@
paddd %1, xmm1
SUM_ACROSS_Q %1
%endmacro
-;void ssim_parms_sse3(
+;void ssim_parms_sse2(
; unsigned char *s,
; int sp,
; unsigned char *r,
@@ -61,8 +61,8 @@
; or pavgb. At this point this is just meant to be a first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse3)
-sym(vp8_ssim_parms_16x16_sse3):
+global sym(vp8_ssim_parms_16x16_sse2)
+sym(vp8_ssim_parms_16x16_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
@@ -84,7 +84,7 @@
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 16 ;row counter
-NextRow:
+.NextRow:
;grab source and reference pixels
movdqu xmm5, [rsi]
@@ -107,7 +107,7 @@
add rdi, rax ; next r row
dec rdx ; counter
- jnz NextRow
+ jnz .NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
@@ -134,7 +134,7 @@
pop rbp
ret
-;void ssim_parms_sse3(
+;void ssim_parms_sse2(
; unsigned char *s,
; int sp,
; unsigned char *r,
@@ -151,8 +151,8 @@
; or pavgb. At this point this is just meant to be a first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse3)
-sym(vp8_ssim_parms_8x8_sse3):
+global sym(vp8_ssim_parms_8x8_sse2)
+sym(vp8_ssim_parms_8x8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
@@ -174,7 +174,7 @@
pxor xmm11,xmm11 ;sum_sxr
mov rdx, 8 ;row counter
-NextRow2:
+.NextRow:
;grab source and reference pixels
movq xmm3, [rsi]
@@ -188,7 +188,7 @@
add rdi, rax ; next r row
dec rdx ; counter
- jnz NextRow2
+ jnz .NextRow
SUM_ACROSS_W xmm15
SUM_ACROSS_W xmm14
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index a47e1f0..4ce16ce 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -93,7 +93,7 @@
mov rcx, 16
pxor mm0, mm0
-submby_loop:
+.submby_loop:
movq mm1, [rsi]
movq mm3, [rax]
@@ -139,7 +139,7 @@
lea rsi, [rsi+rdx]
sub rcx, 1
- jnz submby_loop
+ jnz .submby_loop
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 95888f6..3bd1ff6 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -91,7 +91,7 @@
mov rcx, 8 ; do two lines at one time
-submby_loop:
+.submby_loop:
movdqa xmm0, XMMWORD PTR [rsi] ; src
movdqa xmm1, XMMWORD PTR [rax] ; pred
@@ -133,7 +133,7 @@
lea rsi, [rsi+rdx*2]
sub rcx, 1
- jnz submby_loop
+ jnz .submby_loop
pop rdi
pop rsi
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index b777ef5..b97c694 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -71,26 +71,26 @@
lea rcx, [rdx + 16*16*1]
cmp dword ptr [rsp + block_size], 8
- jne temporal_filter_apply_load_16
+ jne .temporal_filter_apply_load_16
lea rcx, [rdx + 8*8*1]
-temporal_filter_apply_load_8:
+.temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm0, xmm7 ; src[ 0- 7]
movq xmm1, [rsi] ; second row
lea rsi, [rsi + rbp] ; += stride
punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp temporal_filter_apply_load_finished
+ jmp .temporal_filter_apply_load_finished
-temporal_filter_apply_load_16:
+.temporal_filter_apply_load_16:
movdqa xmm0, [rsi] ; src (frame1)
lea rsi, [rsi + rbp] ; += stride
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7 ; src[ 0- 7]
punpckhbw xmm1, xmm7 ; src[ 8-15]
-temporal_filter_apply_load_finished:
+.temporal_filter_apply_load_finished:
movdqa xmm2, [rdx] ; predictor (frame2)
movdqa xmm3, xmm2
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
@@ -176,13 +176,13 @@
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
cmp rdx, rcx
- je temporal_filter_apply_epilog
+ je .temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
cmp dword ptr [rsp + block_size], 16
- je temporal_filter_apply_load_16
- jmp temporal_filter_apply_load_8
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
-temporal_filter_apply_epilog:
+.temporal_filter_apply_epilog:
; begin epilog
mov rbp, [rsp + rbp_backup]
add rsp, stack_size
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index 13b76ea..2be8bbe 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -27,7 +27,7 @@
mov rcx, 16
pxor mm4, mm4
-NEXTROW:
+.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
@@ -44,7 +44,7 @@
add rax, 32
dec rcx
- ja NEXTROW
+ ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
@@ -568,7 +568,7 @@
add rsi, r8
%endif
-filter_block2d_bil4x4_var_mmx_loop:
+.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
@@ -614,7 +614,7 @@
add rdi, r9
%endif
sub rcx, 1 ;
- jnz filter_block2d_bil4x4_var_mmx_loop ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
@@ -726,7 +726,7 @@
add rsi, r8
%endif
-filter_block2d_bil_var_mmx_loop:
+.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
@@ -807,7 +807,7 @@
add rdi, r9
%endif
sub rcx, 1 ;
- jnz filter_block2d_bil_var_mmx_loop ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index b7a6b32..7629220 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -33,7 +33,7 @@
mov rcx, 8
pxor xmm4, xmm4
-NEXTROW:
+.NEXTROW:
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
@@ -50,7 +50,7 @@
add rax, 0x40
dec rcx
- ja NEXTROW
+ ja .NEXTROW
movdqa xmm3,xmm4
psrldq xmm4,8
@@ -126,7 +126,7 @@
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
-var16loop:
+.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
@@ -160,7 +160,7 @@
add rdi, rdx
sub rcx, 1
- jnz var16loop
+ jnz .var16loop
movdqa xmm1, xmm6
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index a582f8d..97e8b0e 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -47,7 +47,7 @@
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_ssse3_sp_only
+ je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
@@ -55,7 +55,7 @@
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_ssse3_fp_only
+ je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
@@ -88,7 +88,7 @@
lea rsi, [rsi + r8]
%endif
-filter_block2d_bil_var_ssse3_loop:
+.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
@@ -142,15 +142,15 @@
%endif
sub rcx, 1
- jnz filter_block2d_bil_var_ssse3_loop
+ jnz .filter_block2d_bil_var_ssse3_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_sp_only:
+.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
- je filter_block2d_bil_var_ssse3_full_pixel
+ je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
@@ -169,7 +169,7 @@
lea rsi, [rsi + rax]
-filter_block2d_bil_sp_only_loop:
+.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
@@ -209,11 +209,11 @@
%endif
sub rcx, 1
- jnz filter_block2d_bil_sp_only_loop
+ jnz .filter_block2d_bil_sp_only_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_full_pixel:
+.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
@@ -221,7 +221,7 @@
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
-filter_block2d_bil_full_pixel_loop:
+.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
@@ -244,11 +244,11 @@
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
- jnz filter_block2d_bil_full_pixel_loop
+ jnz .filter_block2d_bil_full_pixel_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_var_ssse3_fp_only:
+.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
@@ -260,7 +260,7 @@
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
-filter_block2d_bil_fp_only_loop:
+.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
@@ -298,11 +298,11 @@
%endif
sub rcx, 1
- jnz filter_block2d_bil_fp_only_loop
+ jnz .filter_block2d_bil_fp_only_loop
- jmp filter_block2d_bil_variance
+ jmp .filter_block2d_bil_variance
-filter_block2d_bil_variance:
+.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index af6c4d2..4b41b54 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -140,6 +140,8 @@
extern prototype_variance(vp8_mse16x16_wmt);
extern prototype_variance2(vp8_get8x8var_sse2);
extern prototype_variance2(vp8_get16x16var_sse2);
+extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2)
+extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2)
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad4x4
@@ -208,6 +210,14 @@
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_wmt
+#if ARCH_X86_64
+#undef vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2
+
+#undef vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2
+#endif
+
#endif
#endif
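The new hooks are declared once here, and in the static-dispatch case they are bound to the SSE2 kernels only when ARCH_X86_64 is set: the routines in ssim_opt.asm above use xmm8..xmm15 (xmm11, xmm14 and xmm15 are visible in the hunks), registers that exist only in 64-bit mode, so 32-bit builds keep the C fallbacks. Spelled out, the effective binding with runtime CPU detection disabled is (a sketch, not part of the patch):

    #if !CONFIG_RUNTIME_CPU_DETECT && ARCH_X86_64
    #undef  vp8_ssimpf_8x8
    #define vp8_ssimpf_8x8   vp8_ssim_parms_8x8_sse2
    #undef  vp8_ssimpf_16x16
    #define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2
    #endif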
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index badb9f0..36b7b71 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -111,29 +111,6 @@
#endif
-#if HAVE_SSSE3
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
-typedef void ssimpf
-(
- unsigned char *s,
- int sp,
- unsigned char *r,
- int rp,
- unsigned long *sum_s,
- unsigned long *sum_r,
- unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr
-);
-
-extern ssimpf vp8_ssim_parms_16x16_sse3;
-extern ssimpf vp8_ssim_parms_8x8_sse3;
-#endif
-#endif
-#endif
-
-
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -246,6 +223,13 @@
#if !(CONFIG_REALTIME_ONLY)
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
#endif
+
+#if CONFIG_INTERNAL_STATS
+#if ARCH_X86_64
+ cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse2;
+ cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_sse2;
+#endif
+#endif
}
#endif
@@ -280,14 +264,6 @@
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
-
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
- cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
- cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3;
-#endif
-#endif
-
}
#endif
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 053ecae..9ec24d5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -19,8 +19,6 @@
VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
VP8_COMMON_SRCS-yes += common/debugmodes.c
-VP8_COMMON_SRCS-yes += common/defaultcoefcounts.h
-VP8_COMMON_SRCS-yes += common/defaultcoefcounts.c
VP8_COMMON_SRCS-yes += common/entropy.c
VP8_COMMON_SRCS-yes += common/entropymode.c
VP8_COMMON_SRCS-yes += common/entropymv.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index d46d99d..b71a54a 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -34,6 +34,7 @@
#INCLUDES += encoder
VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
VP8_CX_SRCS-yes += encoder/bitstream.c
VP8_CX_SRCS-yes += encoder/boolhuff.c
VP8_CX_SRCS-yes += encoder/dct.c