Merge "Use full-pixel MV in mvsadcost calculation"
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index c46d9d5..04e14a6 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -9,25 +9,13 @@
  */
 
 
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "vpx_config.h"
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <io.h>
-#include <share.h>
 #include "vpx/vpx_integer.h"
-#else
-#include <stdint.h>
-#include <unistd.h>
-#endif
-
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdarg.h>
 
 typedef enum
 {
@@ -47,7 +35,6 @@
 }
 
 #if defined(__GNUC__) && __GNUC__
-
 #if defined(__MACH__)
 
 #include <mach-o/loader.h>
@@ -225,73 +212,6 @@
 
 }
 
-int main(int argc, char **argv)
-{
-    int fd;
-    char *f;
-    struct stat stat_buf;
-    uint8_t *file_buf;
-    int res;
-
-    if (argc < 2 || argc > 3)
-    {
-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tMachO format object file to parse\n");
-        fprintf(stderr, "Output Formats:\n");
-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-        fprintf(stderr, "  rvds - compatible with armasm\n");
-        goto bail;
-    }
-
-    f = argv[2];
-
-    if (!((!strcmp(argv[1], "rvds")) || (!strcmp(argv[1], "gas"))))
-        f = argv[1];
-
-    fd = open(f, O_RDONLY);
-
-    if (fd < 0)
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
-
-    if (fstat(fd, &stat_buf))
-    {
-        perror("stat");
-        goto bail;
-    }
-
-    file_buf = malloc(stat_buf.st_size);
-
-    if (!file_buf)
-    {
-        perror("malloc");
-        goto bail;
-    }
-
-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
-    {
-        perror("read");
-        goto bail;
-    }
-
-    if (close(fd))
-    {
-        perror("close");
-        goto bail;
-    }
-
-    res = parse_macho(file_buf, stat_buf.st_size);
-    free(file_buf);
-
-    if (!res)
-        return EXIT_SUCCESS;
-
-bail:
-    return EXIT_FAILURE;
-}
-
 #elif defined(__ELF__)
 #include "elf.h"
 
@@ -740,96 +660,24 @@
     return 1;
 }
 
-int main(int argc, char **argv)
-{
-    int fd;
-    output_fmt_t mode;
-    char *f;
-    struct stat stat_buf;
-    uint8_t *file_buf;
-    int res;
-
-    if (argc < 2 || argc > 3)
-    {
-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");
-        fprintf(stderr, "Output Formats:\n");
-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-        fprintf(stderr, "  rvds - compatible with armasm\n");
-        goto bail;
-    }
-
-    f = argv[2];
-
-    if (!strcmp(argv[1], "rvds"))
-        mode = OUTPUT_FMT_RVDS;
-    else if (!strcmp(argv[1], "gas"))
-        mode = OUTPUT_FMT_GAS;
-    else
-        f = argv[1];
-
-
-    fd = open(f, O_RDONLY);
-
-    if (fd < 0)
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
-
-    if (fstat(fd, &stat_buf))
-    {
-        perror("stat");
-        goto bail;
-    }
-
-    file_buf = malloc(stat_buf.st_size);
-
-    if (!file_buf)
-    {
-        perror("malloc");
-        goto bail;
-    }
-
-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
-    {
-        perror("read");
-        goto bail;
-    }
-
-    if (close(fd))
-    {
-        perror("close");
-        goto bail;
-    }
-
-    res = parse_elf(file_buf, stat_buf.st_size, mode);
-    free(file_buf);
-
-    if (!res)
-        return EXIT_SUCCESS;
-
-bail:
-    return EXIT_FAILURE;
-}
 #endif
-#endif
+#endif /* defined(__GNUC__) && __GNUC__ */
 
 
-#if defined(_MSC_VER) || defined(__MINGW32__)
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
 /*  See "Microsoft Portable Executable and Common Object File Format Specification"
     for reference.
 */
 #define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
 #define get_le16(x) ((*(x)) | (*(x+1)) << 8)
 
-int parse_coff(unsigned __int8 *buf, size_t sz)
+int parse_coff(uint8_t *buf, size_t sz)
 {
     unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
     unsigned int sectionrawdata_ptr;
     unsigned int i;
-    unsigned __int8 *ptr;
-    unsigned __int32 symoffset;
+    uint8_t *ptr;
+    uint32_t symoffset;
 
     char **sectionlist;  //this array holds all section names in their correct order.
     //it is used to check if the symbol is in .bss or .data section.
@@ -907,7 +755,7 @@
 
     for (i = 0; i < symtab_sz; i++)
     {
-        __int16 section = get_le16(ptr + 12); //section number
+        int16_t section = get_le16(ptr + 12); //section number
 
         if (section > 0 && ptr[16] == 2)
         {
@@ -978,20 +826,21 @@
 
     return 1;
 }
+#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */
 
 int main(int argc, char **argv)
 {
-    int fd;
-    output_fmt_t mode;
+    output_fmt_t mode = OUTPUT_FMT_PLAIN;
     const char *f;
-    struct _stat stat_buf;
-    unsigned __int8 *file_buf;
+    uint8_t *file_buf;
     int res;
+    FILE *fp;
+    long int file_size;
 
     if (argc < 2 || argc > 3)
     {
         fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");
+        fprintf(stderr, "  <obj file>\tobject file to parse\n");
         fprintf(stderr, "Output Formats:\n");
         fprintf(stderr, "  gas  - compatible with GNU assembler\n");
         fprintf(stderr, "  rvds - compatible with armasm\n");
@@ -1007,15 +856,22 @@
     else
         f = argv[1];
 
-    fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
+    fp = fopen(f, "rb");
 
-    if (_fstat(fd, &stat_buf))
+    if (!fp)
+    {
+        perror("Unable to open file");
+        goto bail;
+    }
+
+    if (fseek(fp, 0, SEEK_END))
     {
         perror("stat");
         goto bail;
     }
 
-    file_buf = malloc(stat_buf.st_size);
+    file_size = ftell(fp);
+    file_buf = malloc(file_size);
 
     if (!file_buf)
     {
@@ -1023,19 +879,30 @@
         goto bail;
     }
 
-    if (_read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
+    rewind(fp);
+
+    if (fread(file_buf, sizeof(char), file_size, fp) != file_size)
     {
         perror("read");
         goto bail;
     }
 
-    if (_close(fd))
+    if (fclose(fp))
     {
         perror("close");
         goto bail;
     }
 
-    res = parse_coff(file_buf, stat_buf.st_size);
+#if defined(__GNUC__) && __GNUC__
+#if defined(__MACH__)
+    res = parse_macho(file_buf, file_size);
+#elif defined(__ELF__)
+    res = parse_elf(file_buf, file_size, mode);
+#endif
+#endif
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
+    res = parse_coff(file_buf, file_size);
+#endif
 
     free(file_buf);
 
@@ -1045,4 +912,3 @@
 bail:
     return EXIT_FAILURE;
 }
-#endif
diff --git a/configure b/configure
index c177865..ab3936d 100755
--- a/configure
+++ b/configure
@@ -377,6 +377,7 @@
     if [ -f "${source_path}/build/make/version.sh" ]; then
         local ver=`"$source_path/build/make/version.sh" --bare $source_path`
         DIST_DIR="${DIST_DIR}-${ver}"
+        VERSION_STRING=${ver}
         ver=${ver%%-*}
         VERSION_PATCH=${ver##*.}
         ver=${ver%.*}
@@ -385,6 +386,8 @@
         VERSION_MAJOR=${ver%.*}
     fi
     enabled child || cat <<EOF >> config.mk
+
+PREFIX=${prefix}
 ifeq (\$(MAKECMDGOALS),dist)
 DIST_DIR?=${DIST_DIR}
 else
@@ -392,6 +395,8 @@
 endif
 LIBSUBDIR=${libdir##${prefix}/}
 
+VERSION_STRING=${VERSION_STRING}
+
 VERSION_MAJOR=${VERSION_MAJOR}
 VERSION_MINOR=${VERSION_MINOR}
 VERSION_PATCH=${VERSION_PATCH}
diff --git a/libs.mk b/libs.mk
index 6a5dc18..2cb7f49 100644
--- a/libs.mk
+++ b/libs.mk
@@ -204,6 +204,26 @@
 
 INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
 INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
+
+LIBS-$(BUILD_LIBVPX) += vpx.pc
+vpx.pc: config.mk libs.mk
+	@echo "    [CREATE] $@"
+	$(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@
+	$(qexec)echo 'prefix=$(PREFIX)' >> $@
+	$(qexec)echo 'exec_prefix=$${prefix}' >> $@
+	$(qexec)echo 'libdir=$${prefix}/lib' >> $@
+	$(qexec)echo 'includedir=$${prefix}/include' >> $@
+	$(qexec)echo '' >> $@
+	$(qexec)echo 'Name: vpx' >> $@
+	$(qexec)echo 'Description: WebM Project VPx codec implementation' >> $@
+	$(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@
+	$(qexec)echo 'Requires:' >> $@
+	$(qexec)echo 'Conflicts:' >> $@
+	$(qexec)echo 'Libs: -L$${libdir} -lvpx' >> $@
+	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
+INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
+INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
+CLEAN-OBJS += vpx.pc
 endif
 
 LIBS-$(LIPO_LIBVPX) += libvpx.a
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index d4a88dc..f148b89 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -66,10 +66,10 @@
 
         /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
         cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
-        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
-        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
-        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
-        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;*/
+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;*/
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_armv6;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_armv6;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_armv6;
 
         /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
diff --git a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
new file mode 100644
index 0000000..0ca7438
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -0,0 +1,265 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_subtract_mby_armv6|
+    EXPORT  |vp8_subtract_mbuv_armv6|
+    EXPORT  |vp8_subtract_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *be
+; r1    BLOCKD *bd
+; r2    int pitch
+|vp8_subtract_b_armv6| PROC
+
+    stmfd   sp!, {r4-r9}
+
+    ldr     r4, [r0, #vp8_block_base_src]
+    ldr     r5, [r0, #vp8_block_src]
+    ldr     r6, [r0, #vp8_block_src_diff]
+
+    ldr     r3, [r4]
+    ldr     r7, [r0, #vp8_block_src_stride]
+    add     r3, r3, r5          ; src = *base_src + src
+    ldr     r8, [r1, #vp8_blockd_predictor]
+
+    mov     r9, #4              ; loop count
+
+loop_block
+
+    ldr     r0, [r3], r7        ; src
+    ldr     r1, [r8], r2        ; pred
+
+    uxtb16  r4, r0              ; [s2 | s0]
+    uxtb16  r5, r1              ; [p2 | p0]
+    uxtb16  r0, r0, ror #8      ; [s3 | s1]
+    uxtb16  r1, r1, ror #8      ; [p3 | p1]
+
+    usub16  r4, r4, r5          ; [d2 | d0]
+    usub16  r5, r0, r1          ; [d3 | d1]
+
+    subs    r9, r9, #1          ; decrement loop counter
+
+    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
+    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
+
+    str     r0, [r6, #0]        ; diff
+    str     r1, [r6, #4]        ; diff
+
+    add     r6, r6, r2, lsl #1  ; update diff pointer
+    bne     loop_block
+
+    ldmfd   sp!, {r4-r9}
+    mov     pc, lr
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *usrc
+; r2    unsigned char *vsrc
+; r3    unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    add     r0, r0, #512        ; set *diff point to Cb
+    add     r3, r3, #256        ; set *pred point to Cb
+
+    mov     r4, #8              ; loop count
+    ldr     r5, [sp, #40]       ; stride
+
+    ; Subtract U block
+loop_u
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r1, r1, r5          ; update usrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_u
+
+    mov     r4, #8              ; loop count
+
+    ; Subtract V block
+loop_v
+    ldr     r6, [r2]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r2, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r2, r2, r5          ; update vsrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_v
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *src
+; r2    unsigned char *pred
+; r3    int stride
+|vp8_subtract_mby_armv6| PROC
+
+    stmfd   sp!, {r4-r11}
+
+    mov     r4, #16
+loop
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r2], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r2], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    ldr     r10, [r1, #8]       ; src       (C)
+    ldr     r11, [r2], #4       ; pred      (C)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    uxtb16  r8, r10             ; [s2 | s0] (C)
+    str     r9, [r0], #4        ; diff      (B)
+
+    uxtb16  r9, r11             ; [p2 | p0] (C)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (C)
+    usub16  r7, r10, r11        ; [d3 | d1] (C)
+
+    ldr     r10, [r1, #12]      ; src       (D)
+    ldr     r11, [r2], #4       ; pred      (D)
+
+    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
+    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
+
+    str     r8, [r0], #4        ; diff      (C)
+    uxtb16  r8, r10             ; [s2 | s0] (D)
+    str     r9, [r0], #4        ; diff      (C)
+
+    uxtb16  r9, r11             ; [p2 | p0] (D)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (D)
+    usub16  r7, r10, r11        ; [d3 | d1] (D)
+
+    add     r1, r1, r3          ; update src pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+    str     r8, [r0], #4        ; diff      (D)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (D)
+
+    bne     loop
+
+    ldmfd   sp!, {r4-r11}
+    mov     pc, lr
+
+    ENDP
+
+    END
+
diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
index 8d7258a..9883763 100644
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,14 +25,14 @@
 |vp8_variance16x16_armv6| PROC
 
     stmfd   sp!, {r4-r12, lr}
-    mov     r12, #16            ; set loop counter to 16 (=block height)
     mov     r8, #0              ; initialize sum = 0
     mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
 
 loop
     ; 1st 4 pixels
-    ldr     r4, [r0, #0x0]      ; load 4 src pixels
-    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
 
     mov     lr, #0              ; constant zero
 
@@ -55,8 +55,8 @@
     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
 
     ; 2nd 4 pixels
-    ldr     r4, [r0, #0x4]      ; load 4 src pixels
-    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
 
     usub8   r6, r4, r5          ; calculate difference
@@ -79,8 +79,8 @@
     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
 
     ; 3rd 4 pixels
-    ldr     r4, [r0, #0x8]      ; load 4 src pixels
-    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
 
     usub8   r6, r4, r5          ; calculate difference
@@ -103,8 +103,8 @@
     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
 
     ; 4th 4 pixels
-    ldr     r4, [r0, #0xc]      ; load 4 src pixels
-    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
 
     usub8   r6, r4, r5          ; calculate difference
@@ -135,13 +135,14 @@
     bne     loop
 
     ; return stuff
-    ldr     r6, [sp, #0x28]     ; get address of sse
+    ldr     r6, [sp, #40]       ; get address of sse
     mul     r0, r8, r8          ; sum * sum
     str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
 
     ldmfd   sp!, {r4-r12, pc}
 
     ENDP
 
     END
+
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
new file mode 100644
index 0000000..2350f3e
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,176 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
new file mode 100644
index 0000000..f9ae3b7
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
diff --git a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
new file mode 100644
index 0000000..9e0a035
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,178 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/vp8/encoder/arm/encodemb_arm.c b/vp8/encoder/arm/encodemb_arm.c
deleted file mode 100644
index 88ad3fc..0000000
--- a/vp8/encoder/arm/encodemb_arm.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/encodemb.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/encoder/dct.h"
-#include "vpx_mem/vpx_mem.h"
-
-extern void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
-
-void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *src_ptr = (*(be->base_src) + be->src);
-    short *diff_ptr = be->src_diff;
-    unsigned char *pred_ptr = bd->predictor;
-    int src_stride = be->src_stride;
-
-    vp8_subtract_b_neon_func(diff_ptr, src_ptr, pred_ptr, src_stride, pitch);
-}
diff --git a/vp8/encoder/arm/encodemb_arm.h b/vp8/encoder/arm/encodemb_arm.h
index 8fe4537..bf417fe 100644
--- a/vp8/encoder/arm/encodemb_arm.h
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -12,6 +12,24 @@
 #ifndef ENCODEMB_ARM_H
 #define ENCODEMB_ARM_H
 
+#if HAVE_ARMV6
+extern prototype_subb(vp8_subtract_b_armv6);
+extern prototype_submby(vp8_subtract_mby_armv6);
+extern prototype_submbuv(vp8_subtract_mbuv_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_armv6
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_armv6
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 //extern prototype_berr(vp8_block_error_c);
 //extern prototype_mberr(vp8_mbblock_error_c);
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 3ea00f8..68c2950 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -8,45 +8,58 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-
-    EXPORT |vp8_subtract_b_neon_func|
+    EXPORT |vp8_subtract_b_neon|
     EXPORT |vp8_subtract_mby_neon|
     EXPORT |vp8_subtract_mbuv_neon|
 
+    INCLUDE asm_enc_offsets.asm
+
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
-|vp8_subtract_b_neon_func| PROC
-    ldr             r12, [sp]               ;load pitch
 
-    vld1.8          {d0}, [r1], r3          ;load src
-    vld1.8          {d1}, [r2], r12         ;load pred
-    vld1.8          {d2}, [r1], r3
-    vld1.8          {d3}, [r2], r12
-    vld1.8          {d4}, [r1], r3
-    vld1.8          {d5}, [r2], r12
-    vld1.8          {d6}, [r1], r3
-    vld1.8          {d7}, [r2], r12
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+
+    stmfd   sp!, {r4-r7}
+
+    ldr     r3, [r0, #vp8_block_base_src]
+    ldr     r4, [r0, #vp8_block_src]
+    ldr     r5, [r0, #vp8_block_src_diff]
+    ldr     r3, [r3]
+    ldr     r6, [r0, #vp8_block_src_stride]
+    add     r3, r3, r4                      ; src = *base_src + src
+    ldr     r7, [r1, #vp8_blockd_predictor]
+
+    vld1.8          {d0}, [r3], r6          ;load src
+    vld1.8          {d1}, [r7], r2          ;load pred
+    vld1.8          {d2}, [r3], r6
+    vld1.8          {d3}, [r7], r2
+    vld1.8          {d4}, [r3], r6
+    vld1.8          {d5}, [r7], r2
+    vld1.8          {d6}, [r3], r6
+    vld1.8          {d7}, [r7], r2
 
     vsubl.u8        q10, d0, d1
     vsubl.u8        q11, d2, d3
     vsubl.u8        q12, d4, d5
     vsubl.u8        q13, d6, d7
 
-    mov             r12, r12, lsl #1
+    mov             r2, r2, lsl #1
 
-    vst1.16         {d20}, [r0], r12        ;store diff
-    vst1.16         {d22}, [r0], r12
-    vst1.16         {d24}, [r0], r12
-    vst1.16         {d26}, [r0], r12
+    vst1.16         {d20}, [r5], r2         ;store diff
+    vst1.16         {d22}, [r5], r2
+    vst1.16         {d24}, [r5], r2
+    vst1.16         {d26}, [r5], r2
 
+    ldmfd   sp!, {r4-r7}
     bx              lr
+
     ENDP
 
+
 ;==========================================
 ;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
 |vp8_subtract_mby_neon| PROC
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index ed1fb16..e77be9f 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c
@@ -57,51 +57,38 @@
     unsigned short first_pass[36*16];
     unsigned char  second_pass[20*16];
     const short *HFilter, *VFilter;
+    unsigned int var;
 
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
+    if (xoffset == 4 && yoffset == 0)
+    {
+        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                   dst_ptr, dst_pixels_per_line, sse);
+    }
+    else
+    {
+        HFilter = vp8_bilinear_filters[xoffset];
+        VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            17, 16, HFilter);
-    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             16, 16, 16, VFilter);
+        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                                src_pixels_per_line,
+                                                17, 16, HFilter);
+        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                                 16, 16, 16, VFilter);
 
-    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                   dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 0, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 4,
-                                         ref_ptr, recon_stride, sse);
+        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                       dst_pixels_per_line, sse);
+    }
+    return var;
 }
 
 #endif /* HAVE_ARMV6 */
diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
index c7983c1..9c81c8d 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -48,6 +48,14 @@
 DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
 DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));
 
+// subtract
+DEFINE(vp8_block_base_src,                      offsetof(BLOCK, base_src));
+DEFINE(vp8_block_src,                           offsetof(BLOCK, src));
+DEFINE(vp8_block_src_diff,                      offsetof(BLOCK, src_diff));
+DEFINE(vp8_block_src_stride,                    offsetof(BLOCK, src_stride));
+
+DEFINE(vp8_blockd_predictor,                    offsetof(BLOCKD, predictor));
+
 //pack tokens
 DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
 DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index e5d2bd8..a18447d 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3258,7 +3258,8 @@
     }
 
 #if CONFIG_MULTITHREAD
-    sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+    if (cpi->b_multi_threaded)
+        sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
 #endif
 
     if (cm->filter_level > 0)
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 349c3fd..165dada 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,7 +15,6 @@
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
 
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/encodemb_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/dct_arm.c
@@ -34,10 +33,14 @@
 
 #File list for armv6
 # encoder
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
diff --git a/vpxenc.c b/vpxenc.c
index 6c13cd1..39256b6 100755
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -236,7 +236,13 @@
                 stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
                 stats->buf.buf = new_ptr;
                 stats->buf_alloc_sz = new_sz;
-            } /* else ... */
+            }
+            else
+            {
+                fprintf(stderr,
+                        "\nFailed to realloc firstpass stats buffer.\n");
+                exit(EXIT_FAILURE);
+            }
         }
 
         memcpy(stats->buf_ptr, pkt, len);
@@ -698,10 +704,18 @@
         /* Save a cue point if this is a keyframe. */
         if(is_keyframe)
         {
-            struct cue_entry *cue;
+            struct cue_entry *cue, *new_cue_list;
 
-            glob->cue_list = realloc(glob->cue_list,
-                                     (glob->cues+1) * sizeof(struct cue_entry));
+            new_cue_list = realloc(glob->cue_list,
+                                   (glob->cues+1) * sizeof(struct cue_entry));
+            if(new_cue_list)
+                glob->cue_list = new_cue_list;
+            else
+            {
+                fprintf(stderr, "\nFailed to realloc cue list.\n");
+                exit(EXIT_FAILURE);
+            }
+
             cue = &glob->cue_list[glob->cues];
             cue->time = glob->cluster_timecode;
             cue->loc = glob->cluster_pos;