Added vp8_fast_quantize_b_sse2
Moved vp8_fast_quantize_b_impl_sse from quantize_mmx.asm into
quantize_sse2.asm and renamed it. Updated the assembly code to
match the C version; the zbin_ptr argument is no longer passed.
Change-Id: I1766d9e1ca60e173f65badc0ca0c160c2b51b200
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
index 51cd940..f29a54e 100644
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -284,156 +284,3 @@
UNSHADOW_ARGS
pop rbp
ret
-
-
-;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_sse)
-sym(vp8_fast_quantize_b_impl_sse):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movdqa xmm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm1, [rax]
-
- movdqa xmm3, xmm0
- psraw xmm0, 15
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ; abs
-
- movdqa xmm2, xmm3
- pcmpgtw xmm1, xmm2
-
- pandn xmm1, xmm2
- movdqa xmm3, xmm1
-
- mov rdx, arg(6) ; quant_ptr
- movdqa xmm1, [rdx]
-
- mov rcx, arg(5) ; round_ptr
- movdqa xmm2, [rcx]
-
- paddw xmm3, xmm2
- pmulhuw xmm3, xmm1
-
- pxor xmm3, xmm0
- psubw xmm3, xmm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movdqa xmm0, xmm3
-
- movdqa [rdi], xmm3
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm2, [rax]
-
- pmullw xmm3, xmm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax], xmm3
-
- ; next 8
- movdqa xmm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movdqa xmm5, [rax+16]
-
- movdqa xmm7, xmm4
- psraw xmm4, 15
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4 ; abs
-
- movdqa xmm6, xmm7
- pcmpgtw xmm5, xmm6
-
- pandn xmm5, xmm6
- movdqa xmm7, xmm5
-
- movdqa xmm5, [rdx+16]
- movdqa xmm6, [rcx+16]
-
-
- paddw xmm7, xmm6
- pmulhuw xmm7, xmm5
-
- pxor xmm7, xmm4
- psubw xmm7, xmm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movdqa xmm1, xmm7
- movdqa [rdi+16], xmm7
-
- mov rax, arg(3) ;dequant_ptr
- movdqa xmm6, [rax+16]
-
- pmullw xmm7, xmm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movdqa [rax+16], xmm7
- mov rdi, arg(4) ;scan_mask
-
- pxor xmm7, xmm7
- movdqa xmm2, [rdi]
-
- movdqa xmm3, [rdi+16];
- pcmpeqw xmm0, xmm7
-
- pcmpeqw xmm1, xmm7
- pcmpeqw xmm6, xmm6
-
- pxor xmm0, xmm6
- pxor xmm1, xmm6
-
- psrlw xmm0, 15
- psrlw xmm1, 15
-
- pmaddwd xmm0, xmm2
- pmaddwd xmm1, xmm3
-
- movq xmm2, xmm0
- movq xmm3, xmm1
-
- psrldq xmm0, 8
- psrldq xmm1, 8
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
-
- paddd xmm0, xmm2
- movq xmm1, xmm0
-
- psrldq xmm0, 4
- paddd xmm1, xmm0
-
- movq rcx, xmm1
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
-
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index a1b1c40..3248813 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -252,3 +252,137 @@
UNSHADOW_ARGS
pop rbp
ret
+
+; Quantizes 16 coefficients (two 8-word halves) with SSE2, stores qcoeff and dqcoeff, and returns a position count in rax.
+;int vp8_fast_quantize_b_impl_ssse2(short *coeff_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_ssse2)
+sym(vp8_fast_quantize_b_impl_ssse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ push rbx ; saved and restored, although no instruction visible in this routine references rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; 32 bytes of scratch below: xmm6/xmm7 are spilled here and reloaded before returning (they are callee-saved on Microsoft x64)
+ %define save_xmm6 0
+ %define save_xmm7 16
+
+ %define vp8_fastquantizeb_stack_size save_xmm7 + 16
+
+ sub rsp, vp8_fastquantizeb_stack_size
+
+ movdqa XMMWORD PTR[rsp + save_xmm6], xmm6 ;movdqa needs a 16-byte aligned address; presumably guaranteed by ALIGN_STACK above - confirm
+ movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
+
+ mov rdx, arg(0) ;coeff_ptr
+ mov rcx, arg(2) ;dequant_ptr
+ mov rax, arg(3) ;scan_mask
+ mov rdi, arg(4) ;round_ptr
+ mov rsi, arg(5) ;quant_ptr
+
+ movdqa xmm0, XMMWORD PTR[rdx] ;z lo (coefficients 0-7)
+ movdqa xmm4, XMMWORD PTR[rdx + 16] ;z hi (coefficients 8-15)
+
+ movdqa xmm6, XMMWORD PTR[rdi] ;round lo
+ movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ psraw xmm0, 15 ;sign of z (aka sz)
+ psraw xmm4, 15 ;sign of z (aka sz)
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0 ;x = abs(z)
+ psubw xmm5, xmm4 ;x = abs(z)
+
+ paddw xmm1, xmm6 ;x + round
+ paddw xmm5, xmm7 ;x + round
+
+ pmulhw xmm1, XMMWORD PTR[rsi] ;y = high word of the signed product (x + round) * quant
+ pmulhw xmm5, XMMWORD PTR[rsi + 16] ;y = high word of the signed product (x + round) * quant
+
+ mov rdi, arg(1) ;qcoeff_ptr
+ mov rsi, arg(6) ;dqcoeff_ptr
+
+ movdqa xmm6, XMMWORD PTR[rcx] ;dequant lo
+ movdqa xmm7, XMMWORD PTR[rcx + 16] ;dequant hi
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0 ;(y ^ sz) - sz: reapply the sign of z
+ psubw xmm5, xmm4 ;(y ^ sz) - sz: reapply the sign of z
+
+ movdqa XMMWORD PTR[rdi], xmm1 ;store qcoeff
+ movdqa XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff
+
+ pmullw xmm6, xmm1 ;dqcoeff = low word of qcoeff * dequant
+ pmullw xmm7, xmm5 ;dqcoeff = low word of qcoeff * dequant
+
+ movdqa xmm2, XMMWORD PTR[rax] ;scan_mask lo
+ movdqa xmm3, XMMWORD PTR[rax+16]; scan_mask hi
+
+ pxor xmm4, xmm4 ;clear all bits
+ pcmpeqw xmm1, xmm4
+ pcmpeqw xmm5, xmm4 ;0xffff in every lane where qcoeff == 0
+
+ pcmpeqw xmm4, xmm4 ;set all bits
+ pxor xmm1, xmm4
+ pxor xmm5, xmm4 ;inverted: 0xffff in every lane where qcoeff != 0
+
+ psrlw xmm1, 15
+ psrlw xmm5, 15 ;1 where qcoeff != 0, otherwise 0
+
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm5, xmm3 ;flag * scan_mask word, adjacent pairs summed into dwords
+ ; horizontal add: fold the eight dword partial sums of xmm1/xmm5 into one low dword
+ movq xmm2, xmm1
+ movq xmm3, xmm5
+
+ psrldq xmm1, 8
+ psrldq xmm5, 8
+
+ paddd xmm1, xmm5
+ paddd xmm2, xmm3
+
+ paddd xmm1, xmm2
+ movq xmm5, xmm1
+
+ psrldq xmm1, 4
+ paddd xmm5, xmm1 ;low dword now holds the sum of all eight partial sums
+
+ movq rcx, xmm5
+ and rcx, 0xffff ;keep only the low 16 bits of the sum
+
+ xor rdx, rdx
+ sub rdx, rcx ;rdx = -sum
+
+ bsr rax, rcx ;index of the highest set bit (result undefined when sum == 0)
+ inc rax
+
+ sar rdx, 31 ;all ones when sum != 0, zero when sum == 0
+ and rax, rdx ;return value: highest set bit index + 1, forced to 0 when sum == 0
+
+ movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
+ movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
+
+ movdqa xmm6, XMMWORD PTR[rsp + save_xmm6] ;reload the caller's xmm6
+ movdqa xmm7, XMMWORD PTR[rsp + save_xmm7] ;reload the caller's xmm7
+
+ add rsp, vp8_fastquantizeb_stack_size
+ pop rsp ;NOTE(review): presumably undoes ALIGN_STACK (macro defined elsewhere) - confirm
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 18dc49c..7490a8a 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -88,24 +88,22 @@
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
-int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+int vp8_fast_quantize_b_impl_ssse2(short *coeff_ptr, /* prototype for the assembly routine called below; the name must match the symbol exported by quantize_sse2.asm */
short *qcoeff_ptr, short *dequant_ptr,
short *scan_mask, short *round_ptr,
short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) /* gathers the pointers from b and d and hands them to the SSE2 assembly routine */
{
short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
short *coeff_ptr = &b->coeff[0];
- short *zbin_ptr = &b->zbin[0][0];
short *round_ptr = &b->round[0][0];
short *quant_ptr = &b->quant[0][0];
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = &d->dequant[0][0];
- d->eob = vp8_fast_quantize_b_impl_sse(
+ d->eob = vp8_fast_quantize_b_impl_ssse2( /* the routine's integer return value is stored as the block's eob */
coeff_ptr,
- zbin_ptr,
qcoeff_ptr,
dequant_ptr,
scan_mask,
@@ -116,6 +114,7 @@
);
}
+
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr,short *dequant_ptr,
const int *default_zig_zag, short *round_ptr,
@@ -285,8 +284,10 @@
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
/* cpi->rtcd.encodemb.sub* not implemented for wmt */
- /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse;
- cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
+
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
+
}
#endif