|  | ; | 
|  | ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
|  | ; | 
|  | ;  Use of this source code is governed by a BSD-style license | 
|  | ;  that can be found in the LICENSE file in the root of the source | 
|  | ;  tree. An additional intellectual property rights grant can be found | 
|  | ;  in the file PATENTS.  All contributing project authors may | 
|  | ;  be found in the AUTHORS file in the root of the source tree. | 
|  | ; | 
|  |  | 
|  |  | 
|  | .globl short_idct4x4_ppc | 
|  |  | 
|  | .macro load_c V, LABEL, OFF, R0, R1 | 
|  | lis     \R0, \LABEL@ha | 
|  | la      \R1, \LABEL@l(\R0) | 
|  | lvx     \V, \OFF, \R1 | 
|  | .endm | 
|  |  | 
|  | ;# r3 short *input | 
|  | ;# r4 short *output | 
|  | ;# r5 int pitch | 
|  | .align 2 | 
|  | short_idct4x4_ppc: | 
|  | mfspr   r11, 256            ;# get old VRSAVE | 
|  | oris    r12, r11, 0xfff8 | 
|  | mtspr   256, r12            ;# set VRSAVE | 
|  |  | 
|  | load_c v8, sinpi8sqrt2, 0, r9, r10 | 
|  | load_c v9, cospi8sqrt2minus1, 0, r9, r10 | 
|  | load_c v10, hi_hi, 0, r9, r10 | 
|  | load_c v11, lo_lo, 0, r9, r10 | 
|  | load_c v12, shift_16, 0, r9, r10 | 
|  |  | 
|  | li      r10,  16 | 
|  | lvx     v0,   0, r3         ;# input ip[0], ip[ 4] | 
|  | lvx     v1, r10, r3         ;# input ip[8], ip[12] | 
|  |  | 
|  | ;# first pass | 
|  | vupkhsh v2, v0 | 
|  | vupkhsh v3, v1 | 
|  | vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8] | 
|  | vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8] | 
|  |  | 
|  | vupklsh v0, v0 | 
|  | vmulosh v4, v0, v8 | 
|  | vsraw   v4, v4, v12 | 
|  | vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2) | 
|  |  | 
|  | vupklsh v1, v1 | 
|  | vmulosh v5, v1, v9 | 
|  | vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2) | 
|  | vaddsws v5, v5, v1 | 
|  |  | 
|  | vsubsws v4, v4, v5          ;# c1 | 
|  |  | 
|  | vmulosh v3, v1, v8 | 
|  | vsraw   v3, v3, v12 | 
|  | vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2) | 
|  |  | 
|  | vmulosh v5, v0, v9 | 
|  | vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2) | 
|  | vaddsws v5, v5, v0 | 
|  |  | 
|  | vaddsws v3, v3, v5          ;# d1 | 
|  |  | 
|  | vaddsws v0, v6, v3          ;# a1 + d1 | 
|  | vsubsws v3, v6, v3          ;# a1 - d1 | 
|  |  | 
|  | vaddsws v1, v7, v4          ;# b1 + c1 | 
|  | vsubsws v2, v7, v4          ;# b1 - c1 | 
|  |  | 
|  | ;# transpose input | 
|  | vmrghw  v4, v0, v1          ;# a0 b0 a1 b1 | 
|  | vmrghw  v5, v2, v3          ;# c0 d0 c1 d1 | 
|  |  | 
|  | vmrglw  v6, v0, v1          ;# a2 b2 a3 b3 | 
|  | vmrglw  v7, v2, v3          ;# c2 d2 c3 d3 | 
|  |  | 
|  | vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0 | 
|  | vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1 | 
|  |  | 
|  | vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2 | 
|  | vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3 | 
|  |  | 
|  | ;# second pass | 
|  | vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8] | 
|  | vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8] | 
|  |  | 
|  | vmulosh v4, v1, v8 | 
|  | vsraw   v4, v4, v12 | 
|  | vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2) | 
|  |  | 
|  | vmulosh v5, v3, v9 | 
|  | vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2) | 
|  | vaddsws v5, v5, v3 | 
|  |  | 
|  | vsubsws v4, v4, v5          ;# c1 | 
|  |  | 
|  | vmulosh v2, v3, v8 | 
|  | vsraw   v2, v2, v12 | 
|  | vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2) | 
|  |  | 
|  | vmulosh v5, v1, v9 | 
|  | vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2) | 
|  | vaddsws v5, v5, v1 | 
|  |  | 
|  | vaddsws v3, v2, v5          ;# d1 | 
|  |  | 
|  | vaddsws v0, v6, v3          ;# a1 + d1 | 
|  | vsubsws v3, v6, v3          ;# a1 - d1 | 
|  |  | 
|  | vaddsws v1, v7, v4          ;# b1 + c1 | 
|  | vsubsws v2, v7, v4          ;# b1 - c1 | 
|  |  | 
|  | vspltish v6, 4 | 
|  | vspltish v7, 3 | 
|  |  | 
|  | vpkswss v0, v0, v1 | 
|  | vpkswss v1, v2, v3 | 
|  |  | 
|  | vaddshs v0, v0, v6 | 
|  | vaddshs v1, v1, v6 | 
|  |  | 
|  | vsrah   v0, v0, v7 | 
|  | vsrah   v1, v1, v7 | 
|  |  | 
|  | ;# transpose output | 
|  | vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3 | 
|  | vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3 | 
|  |  | 
|  | vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1 | 
|  | vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3 | 
|  |  | 
|  | stwu    r1,-416(r1)         ;# create space on the stack | 
|  |  | 
|  | stvx    v0,  0, r1 | 
|  | lwz     r6, 0(r1) | 
|  | stw     r6, 0(r4) | 
|  | lwz     r6, 4(r1) | 
|  | stw     r6, 4(r4) | 
|  |  | 
|  | add     r4, r4, r5 | 
|  |  | 
|  | lwz     r6,  8(r1) | 
|  | stw     r6,  0(r4) | 
|  | lwz     r6, 12(r1) | 
|  | stw     r6,  4(r4) | 
|  |  | 
|  | add     r4, r4, r5 | 
|  |  | 
|  | stvx    v1,  0, r1 | 
|  | lwz     r6, 0(r1) | 
|  | stw     r6, 0(r4) | 
|  | lwz     r6, 4(r1) | 
|  | stw     r6, 4(r4) | 
|  |  | 
|  | add     r4, r4, r5 | 
|  |  | 
|  | lwz     r6,  8(r1) | 
|  | stw     r6,  0(r4) | 
|  | lwz     r6, 12(r1) | 
|  | stw     r6,  4(r4) | 
|  |  | 
|  | addi    r1, r1, 416         ;# recover stack | 
|  |  | 
|  | mtspr   256, r11            ;# reset old VRSAVE | 
|  |  | 
|  | blr | 
|  |  | 
|  | .align 4 | 
|  | sinpi8sqrt2: | 
|  | .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 | 
|  |  | 
|  | .align 4 | 
|  | cospi8sqrt2minus1: | 
|  | .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 | 
|  |  | 
|  | .align 4 | 
|  | shift_16: | 
|  | .long      16,    16,    16,    16 | 
|  |  | 
|  | .align 4 | 
|  | hi_hi: | 
|  | .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23 | 
|  |  | 
|  | .align 4 | 
|  | lo_lo: | 
|  | .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 |