Add preload (pld) instructions to ARMv6 encoder asm
Added preload instructions to armv6 encoder optimizations.
This gives about a 5% average speed-up on Tegra2 for a VGA@30fps sequence.
Change-Id: I41d74737720fb71ce7a316f07555357822f3347e
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index 9e0a035..22a50eb 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -25,6 +25,10 @@
|vp8_variance_halfpixvar16x16_v_armv6| PROC
stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
@@ -43,8 +47,10 @@
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums