Merge "Update CHANGELOG for Cayuga release" into cayuga
diff --git a/.mailmap b/.mailmap
index 5052f29..2e1d4a9 100644
--- a/.mailmap
+++ b/.mailmap
@@ -2,3 +2,4 @@
 Johann Koenig <>
 Tero Rintaluoma <> <>
 Tom Finegan <>
+Ralph Giles <> <>
diff --git a/AUTHORS b/AUTHORS
index b8fc45e..a93df45 100644
@@ -4,8 +4,11 @@
 Aaron Watry <>
 Adrian Grange <>
 Alex Converse <>
+Alexis Ballier <>
+Alok Ahuja <>
 Andoni Morales Alastruey <>
 Andres Mejia <>
+Aron Rosenberg <>
 Attila Nagy <>
 Fabio Pedretti <>
 Frank Galligan <>
@@ -22,20 +25,29 @@
 Jim Bankoski <>
 Johann Koenig <>
 John Koleszar <>
+Joshua Bleecher Snyder <>
 Justin Clift <>
 Justin Lebar <>
+Lou Quillio <>
 Luca Barbato <>
 Makoto Kato <>
 Martin Ettl <>
 Michael Kohler <>
+Mike Hommey <>
 Mikhal Shemer <>
 Pascal Massimino <>
 Patrik Westin <>
 Paul Wilkins <>
 Pavol Rusnak <>
 Philip Jägenstedt <>
+Rafael Ávila de Espíndola <>
+Ralph Giles <>
+Ronald S. Bultje <>
 Scott LaVarnway <>
+Stefan Holmer <>
+Taekhyun Kim <>
 Tero Rintaluoma <>
+Thijs Vermeir <>
 Timothy B. Terriberry <>
 Tom Finegan <>
 Yaowu Xu <>
diff --git a/ b/
index 452c3f8..757e068 100644
--- a/
+++ b/
@@ -180,7 +180,7 @@
 LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
-LIBS-$(CONFIG_STATIC) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
+LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
@@ -272,20 +272,20 @@
 ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.S
+    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
 	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(VP8_PREFIX)common/asm_com_offsets.c.S: vp8/common/asm_com_offsets.c
-    CLEAN-OBJS += asm_com_offsets.asm $(VP8_PREFIX)common/asm_com_offsets.c.S
+    $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
-    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
 	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(VP8_PREFIX)encoder/asm_enc_offsets.c.S: vp8/encoder/asm_enc_offsets.c
-    CLEAN-OBJS += asm_enc_offsets.asm $(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+    $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
-    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
 	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(VP8_PREFIX)decoder/asm_dec_offsets.c.S: vp8/decoder/asm_dec_offsets.c
-    CLEAN-OBJS += asm_dec_offsets.asm $(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+    $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
   ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
     asm_com_offsets.asm: obj_int_extract
diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c
index e167d26..e135a4d 100644
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -9,6 +9,8 @@
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
 #include "vpx_ports/asm_offsets.h"
 #include "vpx_scale/yv12config.h"
@@ -25,8 +27,14 @@
 DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
 DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
 DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
 /* add asserts for any offset that is not supported by assembly code */
 /* add asserts for any size that is not supported by assembly code */
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 8330203..36a1865 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2594,6 +2594,8 @@
         cpi->Source = &cpi->scaled_source;
+    else
+        cpi->Source = sd;
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
index b0a3b93..8444b8e 100644
--- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
@@ -18,42 +18,32 @@
     AREA ||.text||, CODE, READONLY, ALIGN=2
 ;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
-;Note: this is VP8 function, which has border=32 and 16. Internal y_width and y_height
-; are always multiples of 16.
+; we depend on VP8BORDERINPIXELS being 32
 |vp8_yv12_extend_frame_borders_neon| PROC
     push            {r4 - r10, lr}
     vpush           {d8 - d15}
-    ;Not need to load y_width, since: y_width = y_stride - 2*border
-    ldr             r3, [r0, #yv12_buffer_config_border]
-    ldr             r1, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
-    ldr             r4, [r0, #yv12_buffer_config_y_height]
-    ldr             lr, [r0, #yv12_buffer_config_y_stride]
+    ; Border = 32
+    ldr             r3, [r0, #yv12_buffer_config_y_width]  ; plane_width
+    ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1
+    ldr             r4, [r0, #yv12_buffer_config_y_height] ; plane_height
+    ldr             lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride
-    cmp             r3, #16
-    beq             b16_extend_frame_borders
+; Border copy for Y plane
+; copy the left and right most columns out
+    add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
+    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
+    sub             r5, r1, #32             ; dest_ptr1 = src_ptr1 - Border
-;border = 32
-;Border copy for Y plane
-;copy the left and right most columns out
-    sub             r5, r1, r3              ;destptr1
-    add             r6, r1, lr
-    sub             r6, r6, r3, lsl #1      ;destptr2
-    sub             r2, r6, #1              ;srcptr2
-    ;Do four rows at one time
-    mov             r12, r4, lsr #2
+    mov             r12, r4, lsr #2         ; plane_height / 4
     vld1.8          {d0[], d1[]}, [r1], lr
     vld1.8          {d4[], d5[]}, [r2], lr
     vld1.8          {d8[], d9[]}, [r1], lr
     vld1.8          {d12[], d13[]}, [r2], lr
-    vld1.8          {d16[], d17[]},  [r1], lr
+    vld1.8          {d16[], d17[]}, [r1], lr
     vld1.8          {d20[], d21[]}, [r2], lr
     vld1.8          {d24[], d25[]}, [r1], lr
     vld1.8          {d28[], d29[]}, [r2], lr
@@ -81,15 +71,16 @@
     bne             copy_left_right_y
 ;Now copy the top and bottom source lines into each line of the respective borders
-    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
-    mul             r8, r3, lr
+    ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer
+    mul             r8, r4, lr              ; plane_height * plane_stride
-    mov             r12, lr, lsr #7
+    ; copy width is plane_stride
+    mov             r12, lr, lsr #7         ; plane_stride / 128
-    sub             r6, r1, r3              ;destptr2
-    sub             r2, r6, lr              ;srcptr2
-    sub             r1, r7, r3              ;srcptr1
-    sub             r5, r1, r8              ;destptr1
+    sub             r1, r1, #32             ; src_ptr1 = y_buffer - Border
+    add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride))
+    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
+    sub             r5, r1, lr, asl #5      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
     vld1.8          {q0, q1}, [r1]!
@@ -101,7 +92,7 @@
     vld1.8          {q6, q7}, [r1]!
     vld1.8          {q14, q15}, [r2]!
-    mov             r7, r3
+    mov             r7, #32                 ; Border
     subs            r7, r7, #1
@@ -115,44 +106,41 @@
     vst1.8          {q6, q7}, [r5]!
     vst1.8          {q14, q15}, [r6]!
-    add             r5, r5, lr
-    sub             r5, r5, #128
-    add             r6, r6, lr
-    sub             r6, r6, #128
+    add             r5, r5, lr              ; dest_ptr1 += plane_stride
+    sub             r5, r5, #128            ; dest_ptr1 -= 128
+    add             r6, r6, lr              ; dest_ptr2 += plane_stride
+    sub             r6, r6, #128            ; dest_ptr2 -= 128
     bne             top_bottom_32
-    sub             r5, r1, r8
-    add             r6, r2, lr
+    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
+    add             r6, r2, lr              ; src_ptr2 + plane_stride
     subs            r12, r12, #1
     bne             copy_top_bottom_y
-    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
+    mov             r7, lr, lsr #4          ; check to see if extra copy is needed
     ands            r7, r7, #0x7
     bne             extra_top_bottom_y
 ;Border copy for U, V planes
-    ldr             r1, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
-    mov             lr, lr, lsr #1              ;uv_stride
-    mov             r3, r3, lsr #1              ;border
-    mov             r4, r4, lsr #1              ;uv_height
-    mov             r8, r8, lsr #2
+; Border = 16
+    ldr             r7, [r0, #yv12_buffer_config_u_buffer]  ; src_ptr1
+    ldr             lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
+    ldr             r3, [r0, #yv12_buffer_config_uv_width]  ; plane_width
+    ldr             r4, [r0, #yv12_buffer_config_uv_height] ; plane_height
     mov             r10, #2
 ;copy the left and right most columns out
-    sub             r5, r1, r3              ;destptr1
-    add             r6, r1, lr
-    sub             r6, r6, r3, lsl #1      ;destptr2
-    sub             r2, r6, #1              ;srcptr2
+    mov             r1, r7                  ; src_ptr1 needs to be saved for second half of loop
+    sub             r5, r1, #16             ; dest_ptr1 = src_ptr1 - Border
+    add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
+    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
-    mov             r7, r1
-    ;Do eight rows at one time
-    mov             r12, r4, lsr #3
+    mov             r12, r4, lsr #3         ; plane_height / 8
     vld1.8          {d0[], d1[]}, [r1], lr
@@ -167,7 +155,7 @@
     vld1.8          {d18[], d19[]}, [r2], lr
     vld1.8          {d20[], d21[]}, [r1], lr
     vld1.8          {d22[], d23[]}, [r2], lr
-    vld1.8          {d24[], d25[]},  [r1], lr
+    vld1.8          {d24[], d25[]}, [r1], lr
     vld1.8          {d26[], d27[]}, [r2], lr
     vld1.8          {d28[], d29[]}, [r1], lr
     vld1.8          {d30[], d31[]}, [r2], lr
@@ -194,12 +182,14 @@
     bne             copy_left_right_uv
 ;Now copy the top and bottom source lines into each line of the respective borders
-    mov             r12, lr, lsr #6
+    mov             r1, r7
+    mul             r8, r4, lr              ; plane_height * plane_stride
+    mov             r12, lr, lsr #6         ; plane_stride / 64
-    sub             r6, r1, r3              ;destptr2
-    sub             r2, r6, lr              ;srcptr2
-    sub             r1, r7, r3              ;srcptr1
-    sub             r5, r1, r8              ;destptr1
+    sub             r1, r1, #16             ; src_ptr1 = u_buffer - Border
+    add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride))
+    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
+    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
     vld1.8          {q0, q1}, [r1]!
@@ -207,7 +197,7 @@
     vld1.8          {q2, q3}, [r1]!
     vld1.8          {q10, q11}, [r2]!
-    mov             r7, r3
+    mov             r7, #16                 ; Border
     subs            r7, r7, #1
@@ -217,38 +207,37 @@
     vst1.8          {q2, q3}, [r5]!
     vst1.8          {q10, q11}, [r6]!
-    add             r5, r5, lr
+    add             r5, r5, lr              ; dest_ptr1 += plane_stride
     sub             r5, r5, #64
-    add             r6, r6, lr
+    add             r6, r6, lr              ; dest_ptr2 += plane_stride
     sub             r6, r6, #64
     bne             top_bottom_16
-    sub             r5, r1, r8
-    add             r6, r2, lr
+    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
+    add             r6, r2, lr              ; dest_ptr2 = src_ptr2 + plane_stride
     subs            r12, r12, #1
     bne             copy_top_bottom_uv
-    mov             r7, lr, lsr #3              ;check to see if extra copy is needed
+    mov             r7, lr, lsr #3          ; check to see if extra copy is needed
     ands            r7, r7, #0x7
     bne             extra_top_bottom_uv
     subs            r10, r10, #1
-    ldrne           r1, [r0, #yv12_buffer_config_v_buffer]       ;srcptr1
+    ldrne           r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1
     bne             border_copy_uv
     vpop            {d8 - d15}
     pop             {r4 - r10, pc}
-;extra copy part for Y
     vld1.8          {q0}, [r1]!
     vld1.8          {q2}, [r2]!
-    mov             r9, r3, lsr #3
+    mov             r9, #4                  ; 32 >> 3
     subs            r9, r9, #1
@@ -271,19 +260,18 @@
     vst1.8          {q2}, [r6], lr
     bne             extra_top_bottom_32
-    sub             r5, r1, r8
-    add             r6, r2, lr
+    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
+    add             r6, r2, lr              ; src_ptr2 + plane_stride
     subs            r7, r7, #1
     bne             extra_top_bottom_y
     b               end_of_border_copy_y
-;extra copy part for UV
     vld1.8          {d0}, [r1]!
     vld1.8          {d8}, [r2]!
-    mov             r9, r3, lsr #3
+    mov             r9, #2                  ; 16 >> 3
     subs            r9, r9, #1
@@ -306,283 +294,12 @@
     vst1.8          {d8}, [r6], lr
     bne             extra_top_bottom_16
-    sub             r5, r1, r8
-    add             r6, r2, lr
+    sub             r5, r1, lr, asl #4      ; src_ptr1 - (Border * plane_stride)
+    add             r6, r2, lr              ; src_ptr2 + plane_stride
     subs            r7, r7, #1
     bne             extra_top_bottom_uv
     b               end_of_border_copy_uv
-;border = 16
-;Border copy for Y plane
-;copy the left and right most columns out
-    sub             r5, r1, r3              ;destptr1
-    add             r6, r1, lr
-    sub             r6, r6, r3, lsl #1      ;destptr2
-    sub             r2, r6, #1              ;srcptr2
-    ;Do four rows at one time
-    mov             r12, r4, lsr #2
-    vld1.8          {d0[], d1[]}, [r1], lr
-    vld1.8          {d4[], d5[]}, [r2], lr
-    vld1.8          {d8[], d9[]}, [r1], lr
-    vld1.8          {d12[], d13[]}, [r2], lr
-    vld1.8          {d16[], d17[]},  [r1], lr
-    vld1.8          {d20[], d21[]}, [r2], lr
-    vld1.8          {d24[], d25[]}, [r1], lr
-    vld1.8          {d28[], d29[]}, [r2], lr
-    subs            r12, r12, #1
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q4}, [r5], lr
-    vst1.8          {q6}, [r6], lr
-    vst1.8          {q8}, [r5], lr
-    vst1.8          {q10}, [r6], lr
-    vst1.8          {q12}, [r5], lr
-    vst1.8          {q14}, [r6], lr
-    bne             copy_left_right_y_b16
-;Now copy the top and bottom source lines into each line of the respective borders
-    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
-    mul             r8, r3, lr
-    mov             r12, lr, lsr #7
-    sub             r6, r1, r3              ;destptr2
-    sub             r2, r6, lr              ;srcptr2
-    sub             r1, r7, r3              ;srcptr1
-    sub             r5, r1, r8              ;destptr1
-    vld1.8          {q0, q1}, [r1]!
-    vld1.8          {q8, q9}, [r2]!
-    vld1.8          {q2, q3}, [r1]!
-    vld1.8          {q10, q11}, [r2]!
-    vld1.8          {q4, q5}, [r1]!
-    vld1.8          {q12, q13}, [r2]!
-    vld1.8          {q6, q7}, [r1]!
-    vld1.8          {q14, q15}, [r2]!
-    mov             r7, r3
-    subs            r7, r7, #1
-    vst1.8          {q0, q1}, [r5]!
-    vst1.8          {q8, q9}, [r6]!
-    vst1.8          {q2, q3}, [r5]!
-    vst1.8          {q10, q11}, [r6]!
-    vst1.8          {q4, q5}, [r5]!
-    vst1.8          {q12, q13}, [r6]!
-    vst1.8          {q6, q7}, [r5]!
-    vst1.8          {q14, q15}, [r6]!
-    add             r5, r5, lr
-    sub             r5, r5, #128
-    add             r6, r6, lr
-    sub             r6, r6, #128
-    bne             top_bottom_16_b16
-    sub             r5, r1, r8
-    add             r6, r2, lr
-    subs            r12, r12, #1
-    bne             copy_top_bottom_y_b16
-    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
-    ands            r7, r7, #0x7
-    bne             extra_top_bottom_y_b16
-;Border copy for U, V planes
-    ldr             r1, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
-    mov             lr, lr, lsr #1              ;uv_stride
-    mov             r3, r3, lsr #1              ;border
-    mov             r4, r4, lsr #1              ;uv_height
-    mov             r8, r8, lsr #2
-    mov             r10, #2
-;copy the left and right most columns out
-    sub             r5, r1, r3              ;destptr1
-    add             r6, r1, lr
-    sub             r6, r6, r3, lsl #1      ;destptr2
-    sub             r2, r6, #1              ;srcptr2
-    mov             r7, r1
-    ;Do eight rows at one time
-    mov             r12, r4, lsr #3
-    vld1.8          {d0[]}, [r1], lr
-    vld1.8          {d2[]}, [r2], lr
-    vld1.8          {d4[]}, [r1], lr
-    vld1.8          {d6[]}, [r2], lr
-    vld1.8          {d8[]},  [r1], lr
-    vld1.8          {d10[]}, [r2], lr
-    vld1.8          {d12[]}, [r1], lr
-    vld1.8          {d14[]}, [r2], lr
-    vld1.8          {d16[]}, [r1], lr
-    vld1.8          {d18[]}, [r2], lr
-    vld1.8          {d20[]}, [r1], lr
-    vld1.8          {d22[]}, [r2], lr
-    vld1.8          {d24[]},  [r1], lr
-    vld1.8          {d26[]}, [r2], lr
-    vld1.8          {d28[]}, [r1], lr
-    vld1.8          {d30[]}, [r2], lr
-    subs            r12, r12, #1
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d2}, [r6], lr
-    vst1.8          {d4}, [r5], lr
-    vst1.8          {d6}, [r6], lr
-    vst1.8          {d8}, [r5], lr
-    vst1.8          {d10}, [r6], lr
-    vst1.8          {d12}, [r5], lr
-    vst1.8          {d14}, [r6], lr
-    vst1.8          {d16}, [r5], lr
-    vst1.8          {d18}, [r6], lr
-    vst1.8          {d20}, [r5], lr
-    vst1.8          {d22}, [r6], lr
-    vst1.8          {d24}, [r5], lr
-    vst1.8          {d26}, [r6], lr
-    vst1.8          {d28}, [r5], lr
-    vst1.8          {d30}, [r6], lr
-    bne             copy_left_right_uv_b16
-;Now copy the top and bottom source lines into each line of the respective borders
-    mov             r12, lr, lsr #6
-    sub             r6, r1, r3              ;destptr2
-    sub             r2, r6, lr              ;srcptr2
-    sub             r1, r7, r3              ;srcptr1
-    sub             r5, r1, r8              ;destptr1
-    vld1.8          {q0, q1}, [r1]!
-    vld1.8          {q8, q9}, [r2]!
-    vld1.8          {q2, q3}, [r1]!
-    vld1.8          {q10, q11}, [r2]!
-    mov             r7, r3
-    subs            r7, r7, #1
-    vst1.8          {q0, q1}, [r5]!
-    vst1.8          {q8, q9}, [r6]!
-    vst1.8          {q2, q3}, [r5]!
-    vst1.8          {q10, q11}, [r6]!
-    add             r5, r5, lr
-    sub             r5, r5, #64
-    add             r6, r6, lr
-    sub             r6, r6, #64
-    bne             top_bottom_8_b16
-    sub             r5, r1, r8
-    add             r6, r2, lr
-    subs            r12, r12, #1
-    bne             copy_top_bottom_uv_b16
-    mov             r7, lr, lsr #3              ;check to see if extra copy is needed
-    ands            r7, r7, #0x7
-    bne             extra_top_bottom_uv_b16
-    subs            r10, r10, #1
-    ldrne           r1, [r0, #yv12_buffer_config_v_buffer]       ;srcptr1
-    bne             border_copy_uv_b16
-    vpop            {d8-d15}
-    pop             {r4 - r10, pc}
-;extra copy part for Y
-    vld1.8          {q0}, [r1]!
-    vld1.8          {q2}, [r2]!
-    mov             r9, r3, lsr #3
-    subs            r9, r9, #1
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    vst1.8          {q0}, [r5], lr
-    vst1.8          {q2}, [r6], lr
-    bne             extra_top_bottom_16_b16
-    sub             r5, r1, r8
-    add             r6, r2, lr
-    subs            r7, r7, #1
-    bne             extra_top_bottom_y_b16
-    b               end_of_border_copy_y_b16
-;extra copy part for UV
-    vld1.8          {d0}, [r1]!
-    vld1.8          {d8}, [r2]!
-    mov             r9, r3, lsr #3
-    subs            r9, r9, #1
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    vst1.8          {d0}, [r5], lr
-    vst1.8          {d8}, [r6], lr
-    bne             extra_top_bottom_8_b16
-    sub             r5, r1, r8
-    add             r6, r2, lr
-    subs            r7, r7, #1
-    bne             extra_top_bottom_uv_b16
-    b               end_of_border_copy_uv_b16