|  | ; | 
|  | ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
|  | ; | 
|  | ;  Use of this source code is governed by a BSD-style license | 
|  | ;  that can be found in the LICENSE file in the root of the source | 
|  | ;  tree. An additional intellectual property rights grant can be found | 
|  | ;  in the file PATENTS.  All contributing project authors may | 
|  | ;  be found in the AUTHORS file in the root of the source tree. | 
|  | ; | 
|  |  | 
|  |  | 
|  | EXPORT  |vp8_sixtap_predict8x4_armv6| | ; make the routine's symbol visible outside this file
|  |  | 
|  | AREA    |.text|, CODE, READONLY  ; the routine and its coefficient table live in a read-only code area named .text | 
|  | ;------------------------------------- | 
|  | ; r0    unsigned char *src_ptr, | 
|  | ; r1    int  src_pixels_per_line, | 
|  | ; r2    int  xoffset, | 
|  | ; r3    int  yoffset, | 
|  | ; stack unsigned char *dst_ptr, | 
|  | ; stack int  dst_pitch | 
|  | ;------------------------------------- | 
|  | ;Note: the first pass stores its output transposed in a temporary buffer on the stack: 8 buffer lines (one per output column), each holding 9 rows of 16-bit results. | 
|  | ;Each buffer line is 20 bytes: 9 halfwords plus 2 bytes of padding to keep lines 4-byte aligned. 184 bytes are reserved: 4 for the saved yoffset, 160 for the buffer; the last 20 are never accessed. | 
|  | ;The second pass loads the data column by column from that buffer and stores the 4 output rows of each column down dst, which transposes the result back. | 
|  | |vp8_sixtap_predict8x4_armv6| PROC | 
|  | stmdb       sp!, {r4 - r11, lr} | ; push r4-r11 and lr (9 words, 36 bytes)
|  | str         r3, [sp, #-184]!            ;reserve 184 bytes of stack for temporary storage and store yoffset at the new [sp] | 
|  |  | 
|  | cmp         r2, #0                      ;skip first_pass filter if xoffset=0 | 
|  | add         lr, sp, #4                  ;lr = temporary buffer, just above the saved yoffset | 
|  | beq         skip_firstpass_filter | 
|  |  | 
|  | ;first-pass filter: horizontal 6-tap filter over 9 source rows, producing 2 pixels per inner iteration | 
|  | adr         r12, filter8_coeff | ; r12 = base of the coefficient table
|  | sub         r0, r0, r1, lsl #1 | ; start 2 rows above src_ptr
|  |  | 
|  | add         r3, r1, #10                 ; preload next row | 
|  | pld         [r0, r3] | 
|  |  | 
|  | add         r2, r12, r2, lsl #4         ;r2 = filter8_coeff + xoffset*16 (one 16-byte entry per offset) | 
|  | add         r0, r0, #3                  ;adjust src only for loading convenience (leading pixels sit at offsets -5..-1) | 
|  |  | 
|  | ldr         r3, [r2]                    ; load up packed filter coefficients: r3 = vp8_filter[0], [1] | 
|  | ldr         r4, [r2, #4] | ; r4 = vp8_filter[2], [3]
|  | ldr         r5, [r2, #8] | ; r5 = vp8_filter[4], [5]
|  |  | 
|  | mov         r2, #0x90000                ; height=9 is top part of counter (r2 bits 16 and up) | 
|  |  | 
|  | sub         r1, r1, #8 | ; r1 = stride - 8, since the inner loop advances r0 by 8 per row
|  |  | 
|  | |first_pass_hloop_v6| | 
|  | ldrb        r6, [r0, #-5]               ; load source data: the 5 leading pixels of the row (x = -2..2) | 
|  | ldrb        r7, [r0, #-4] | 
|  | ldrb        r8, [r0, #-3] | 
|  | ldrb        r9, [r0, #-2] | 
|  | ldrb        r10, [r0, #-1] | 
|  |  | 
|  | orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2 (low byte of r2 = 4 inner iterations) | 
|  |  | 
|  | pkhbt       r6, r6, r7, lsl #16         ; r7 | r6 | 
|  | pkhbt       r7, r7, r8, lsl #16         ; r8 | r7 | 
|  |  | 
|  | pkhbt       r8, r8, r9, lsl #16         ; r9 | r8 | 
|  | pkhbt       r9, r9, r10, lsl #16        ; r10 | r9 | 
|  |  | 
|  | |first_pass_wloop_v6| | 
|  | smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1] (r11 = first output pixel of the pair) | 
|  | smuad       r12, r7, r3 | ; r12 = second output pixel of the pair
|  |  | 
|  | ldrb        r6, [r0], #1 | ; fetch the next source pixel
|  |  | 
|  | smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3] | 
|  | ldrb        r7, [r0], #1 | ; fetch one more source pixel
|  | smlad       r12, r9, r4, r12 | 
|  |  | 
|  | pkhbt       r10, r10, r6, lsl #16       ; r6 | r10 | 
|  | pkhbt       r6, r6, r7, lsl #16         ; r7 | r6 | 
|  | smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5] | 
|  | smlad       r12, r6, r5, r12 | 
|  |  | 
|  | sub         r2, r2, #1 | 
|  |  | 
|  | add         r11, r11, #0x40             ; round_shift_and_clamp: add 64, shift right by 7, saturate to 0..255 | 
|  | tst         r2, #0xff                   ; test loop counter; the flags stay live until the bne below | 
|  | usat        r11, #8, r11, asr #7 | 
|  | add         r12, r12, #0x40 | 
|  | strh        r11, [lr], #20              ; result is stored transposed: consecutive pixels go to consecutive 20-byte buffer lines | 
|  | usat        r12, #8, r12, asr #7 | 
|  |  | 
|  | strh        r12, [lr], #20 | 
|  |  | 
|  | movne       r11, r6 | ; if more pixels remain, slide the source window right by two pixels
|  | movne       r12, r7 | 
|  |  | 
|  | movne       r6, r8 | 
|  | movne       r7, r9 | 
|  | movne       r8, r10 | 
|  | movne       r9, r11 | 
|  | movne       r10, r12 | 
|  |  | 
|  | bne         first_pass_wloop_v6 | 
|  |  | 
|  | ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines | 
|  | ;;IF ARCHITECTURE=6 | 
|  | ;pld        [src, ppl] | 
|  | ;;pld       [src, r9] | 
|  | ;;ENDIF | 
|  |  | 
|  | subs        r2, r2, #0x10000 | ; one row done; the flags feed the bne at the end of the loop
|  |  | 
|  | sub         lr, lr, #158 | ; back to the first buffer line, one halfword further along (8*20 - 2)
|  |  | 
|  | add         r0, r0, r1                  ; move to next input line | 
|  |  | 
|  | add         r11, r1, #18                ; preload next row. adding back block width(=8), which was subtracted earlier | 
|  | pld         [r0, r11] | 
|  |  | 
|  | bne         first_pass_hloop_v6 | 
|  |  | 
|  | ;second pass filter: vertical 6-tap filter over the temporary buffer; sp itself is used as the read pointer | 
|  | secondpass_filter | 
|  | ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer | 
|  | ldr         r0, [sp, #216]              ; load dst address from stack 180+36 | 
|  | ldr         r1, [sp, #220]              ; load dst stride from stack 180+40 | 
|  |  | 
|  | cmp         r3, #0 | 
|  | beq         skip_secondpass_filter | ; yoffset == 0: copy without filtering
|  |  | 
|  | adr         r12, filter8_coeff | 
|  | add         lr, r12, r3, lsl #4         ;lr = filter8_coeff + yoffset*16 | 
|  |  | 
|  | mov         r2, #0x00080000 | ; upper halfword of r2 counts the 8 columns
|  |  | 
|  | ldr         r3, [lr]                    ; load up packed filter coefficients | 
|  | ldr         r4, [lr, #4] | 
|  | ldr         r5, [lr, #8] | 
|  |  | 
|  | pkhbt       r12, r4, r3                 ; pack the filter differently: r12 = vp8_filter[1] (high) | vp8_filter[2] (low) | 
|  | pkhbt       r11, r5, r4 | ; r11 = vp8_filter[3] (high) | vp8_filter[4] (low)
|  |  | 
|  | second_pass_hloop_v6 | 
|  | ldr         r6, [sp]                    ; load the data: first two rows of this column | 
|  | ldr         r7, [sp, #4] | ; next two rows
|  |  | 
|  | orr         r2, r2, #2                  ; loop counter: 2 inner iterations, each producing 2 output rows | 
|  |  | 
|  | second_pass_wloop_v6 | 
|  | smuad       lr, r3, r6                  ; apply filter: lr = first output of the pair, r10 = second | 
|  | smulbt      r10, r3, r6 | 
|  |  | 
|  | ldr         r8, [sp, #8] | 
|  |  | 
|  | smlad       lr, r4, r7, lr | 
|  | smladx      r10, r12, r7, r10 | 
|  |  | 
|  | ldrh        r9, [sp, #12] | ; the one extra row needed by the second output
|  |  | 
|  | smlad       lr, r5, r8, lr | 
|  | smladx      r10, r11, r8, r10 | 
|  |  | 
|  | add         sp, sp, #4 | ; advance the read pointer by two rows
|  | smlatb      r10, r5, r9, r10 | 
|  |  | 
|  | sub         r2, r2, #1 | 
|  |  | 
|  | add         lr, lr, #0x40               ; round_shift_and_clamp | 
|  | tst         r2, #0xff | 
|  | usat        lr, #8, lr, asr #7 | 
|  | add         r10, r10, #0x40 | 
|  | strb        lr, [r0], r1                ; the result is transposed back and stored | 
|  | usat        r10, #8, r10, asr #7 | 
|  |  | 
|  | strb        r10, [r0],r1 | 
|  |  | 
|  | movne       r6, r7 | ; if another iteration follows, reuse the rows already loaded
|  | movne       r7, r8 | 
|  |  | 
|  | bne         second_pass_wloop_v6 | 
|  |  | 
|  | subs        r2, r2, #0x10000 | ; one column done; the flags feed the bne below
|  | add         sp, sp, #12                 ; update src for next loop (20-8) | 
|  | sub         r0, r0, r1, lsl #2 | ; dst back up the 4 rows just written
|  | add         r0, r0, #1 | ; and one pixel to the right
|  |  | 
|  | bne         second_pass_hloop_v6 | 
|  |  | 
|  | add         sp, sp, #20 | ; release the remainder of the 184-byte temporary area
|  | ldmia       sp!, {r4 - r11, pc} | ; restore registers and return
|  |  | 
|  | ;-------------------- | 
|  | skip_firstpass_filter | ; xoffset == 0: copy 9 rows of 8 pixels into the buffer unfiltered
|  | sub         r0, r0, r1, lsl #1 | ; start 2 rows above src_ptr
|  | sub         r1, r1, #8 | ; r1 = stride - 8, since each row advances r0 by 8
|  | mov         r2, #9 | ; 9 rows
|  |  | 
|  | skip_firstpass_hloop | 
|  | ldrb        r4, [r0], #1                ; load data | 
|  | subs        r2, r2, #1 | ; the flags feed the bne at the end of the loop
|  | ldrb        r5, [r0], #1 | 
|  | strh        r4, [lr], #20               ; store it to the intermediate buffer as a halfword, transposed | 
|  | ldrb        r6, [r0], #1                ; load data | 
|  | strh        r5, [lr], #20 | 
|  | ldrb        r7, [r0], #1 | 
|  | strh        r6, [lr], #20 | 
|  | ldrb        r8, [r0], #1 | 
|  | strh        r7, [lr], #20 | 
|  | ldrb        r9, [r0], #1 | 
|  | strh        r8, [lr], #20 | 
|  | ldrb        r10, [r0], #1 | 
|  | strh        r9, [lr], #20 | 
|  | ldrb        r11, [r0], #1 | 
|  | strh        r10, [lr], #20 | 
|  | add         r0, r0, r1                  ; move to next input line | 
|  | strh        r11, [lr], #20 | 
|  |  | 
|  | sub         lr, lr, #158                ; move over to next column | 
|  | bne         skip_firstpass_hloop | 
|  |  | 
|  | b           secondpass_filter | 
|  |  | 
|  | ;-------------------- | 
|  | skip_secondpass_filter | ; yoffset == 0: copy 4 rows of each buffer column to dst unfiltered
|  | mov         r2, #8 | ; 8 columns
|  | add         sp, sp, #4                  ;start from src[0] instead of src[-2] | 
|  |  | 
|  | skip_secondpass_hloop | 
|  | ldr         r6, [sp], #4 | ; two rows packed as halfwords
|  | subs        r2, r2, #1 | ; the flags feed the bne at the end of the loop
|  | ldr         r8, [sp], #4 | ; the following two rows
|  |  | 
|  | mov         r7, r6, lsr #16             ; unpack | 
|  | strb        r6, [r0], r1 | 
|  | mov         r9, r8, lsr #16 | 
|  | strb        r7, [r0], r1 | 
|  | add         sp, sp, #12                 ; 20-8 | 
|  | strb        r8, [r0], r1 | 
|  | strb        r9, [r0], r1 | 
|  |  | 
|  | sub         r0, r0, r1, lsl #2 | ; dst back up the 4 rows just written
|  | add         r0, r0, #1 | ; and one pixel to the right
|  |  | 
|  | bne         skip_secondpass_hloop | 
|  |  | 
|  | add         sp, sp, #16                 ; 180 - (160 +4) | 
|  |  | 
|  | ldmia       sp!, {r4 - r11, pc} | ; restore registers and return
|  |  | 
|  | ENDP | 
|  |  | 
|  | ;----------------- | 
|  | ;Six-tap filter table: 8 entries of 16 bytes each, addressed as filter8_coeff + offset*16 where offset is xoffset or yoffset. | 
|  | ;Each entry packs the six taps as signed 16-bit halves of three words (taps 0|1, 2|3, 4|5, first tap in the low half); the fourth word is zero padding. | 
|  | filter8_coeff | 
|  | DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000 | 
|  | DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000 | 
|  | DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000 | 
|  | DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000 | 
|  | DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000 | 
|  | DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000 | 
|  | DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000 | 
|  | DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000 | 
|  |  | 
|  | ;DCD        0,  0,  128,    0,   0,  0 | 
|  | ;DCD        0, -6,  123,   12,  -1,  0 | 
|  | ;DCD        2, -11, 108,   36,  -8,  1 | 
|  | ;DCD        0, -9,   93,   50,  -6,  0 | 
|  | ;DCD        3, -16,  77,   77, -16,  3 | 
|  | ;DCD        0, -6,   50,   93,  -9,  0 | 
|  | ;DCD        1, -8,   36,  108, -11,  2 | 
|  | ;DCD        0, -1,   12,  123,  -6,  0 | 
|  |  | 
|  | END |