Add preload for ARMv6 encoder asm

Added preload (PLD) instructions to the ARMv6 encoder optimizations.
About 5% average speed-up on Tegra2 for a VGA@30fps sequence.

Change-Id: I41d74737720fb71ce7a316f07555357822f3347e
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
index 9e0a035..22a50eb 100644
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_v_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
     mov     r8, #0              ; initialize sum = 0
     ldr     r10, c80808080
     mov     r11, #0             ; initialize sse = 0
@@ -43,8 +47,10 @@
     eor     r4, r4, r10
 
     usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
     sel     r7, r6, lr          ; select bytes with positive difference
     usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
     sel     r6, r6, lr          ; select bytes with negative difference
 
     ; calculate partial sums