blob: dc9c63226a943f577abf30a14f9ee0586ec944b2 [file] [log] [blame]
Yaowu Xuc27fc142016-08-22 16:08:15 -07001/*
Yaowu Xu9c01aa12016-09-01 14:32:49 -07002 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
Yaowu Xuc27fc142016-08-22 16:08:15 -07003 *
Yaowu Xu9c01aa12016-09-01 14:32:49 -07004 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Yaowu Xuc27fc142016-08-22 16:08:15 -070010 */
11
12#include "aom_dsp/mips/fwd_txfm_msa.h"
13
14static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
15 int32_t src_stride,
16 int16_t *temp_buff) {
17 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
18 v8i16 step0, step1, step2, step3;
19 v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
20 v8i16 step0_1, step1_1, step2_1, step3_1;
21
22 /* 1st and 2nd set */
23 LD_SH4(input, src_stride, in0, in1, in2, in3);
24 LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
25 LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
26 LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
27 SLLI_4V(in0, in1, in2, in3, 2);
28 SLLI_4V(in4, in5, in6, in7, 2);
29 SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
30 SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
31 BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
32 step3, in4, in5, in6, in7);
33 BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
34 step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
35 ST_SH4(step0, step1, step2, step3, temp_buff, 8);
36 ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
37 ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
38 ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
39
40 /* 3rd and 4th set */
41 LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
42 LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
43 LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
44 LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
45 SLLI_4V(in0, in1, in2, in3, 2);
46 SLLI_4V(in4, in5, in6, in7, 2);
47 SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
48 SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
49 BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
50 step3, in4, in5, in6, in7);
51 BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
52 step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
53 ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
54 ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
55 ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
56 ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
57}
58
/* Even half of the 32-point column DCT for one 8-wide slice.
 *
 * input - the 16 "sum" rows produced by the load/butterfly pass
 *         (row pitch of 8 int16)
 * temp  - output coefficient buffer; store offsets are multiples of 64
 *         int16 (two 32-element rows of the caller's 32x32 buffer), so each
 *         ST_SH below lands on one even-frequency coefficient row
 *
 * Every result pair is rounded/truncated with FDCT32_POSTPROC_2V_POS_H
 * immediately before being stored.  Statement order is significant: vector
 * registers are aggressively reused across the stages.
 */
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even */
  /* stage 2 butterfly: rows 0-3 with 12-15, rows 4-7 with 8-11 */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
              vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
              in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);
  ST_SH(temp1, temp + 512);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256);
  ST_SH(temp1, temp + 768);

  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128);
  ST_SH(temp1, temp + 896);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640);
  ST_SH(temp1, temp + 384);

  /* remaining even coefficients from the stage-2 difference terms */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);
  ST_SH(temp1, temp + 960);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576);
  ST_SH(temp1, temp + 448);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320);
  ST_SH(temp1, temp + 704);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192);
  ST_SH(temp1, temp + 832);
}
132
/* Odd half of the 32-point column DCT for one 8-wide slice.
 *
 * input    - the 16 "difference" rows from the load/butterfly pass
 *            (row pitch of 8 int16).  NOTE: this buffer is clobbered -
 *            intermediate differences are stored back into it and reloaded
 *            later in the function.
 * temp_ptr - output coefficient buffer; store offsets are multiples of 64
 *            int16 (two 32-element rows of the caller's 32x32 buffer)
 *
 * Each result pair is rounded with FDCT32_POSTPROC_2V_POS_H before store.
 */
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(input + 32);
  in21 = LD_SH(input + 40);
  in26 = LD_SH(input + 80);
  in27 = LD_SH(input + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(input + 16);
  in19 = LD_SH(input + 24);
  in28 = LD_SH(input + 96);
  in29 = LD_SH(input + 104);

  /* stash differences back into the (now free) input slots for the second
   * half of this function */
  vec4 = in19 - in20;
  ST_SH(vec4, input + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, input + 40);
  vec4 = in29 - in26;
  ST_SH(vec4, input + 80);
  vec4 = in28 - in27;
  ST_SH(vec4, input + 88);

  in21 = in18 + in21;
  in20 = in19 + in20;
  in27 = in28 + in27;
  in26 = in29 + in26;

  LD_SH4(input + 48, 8, in22, in23, in24, in25);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(input);
  in17 = LD_SH(input + 8);
  in30 = LD_SH(input + 112);
  in31 = LD_SH(input + 120);

  /* stash the remaining differences as well */
  vec4 = in17 - in22;
  ST_SH(vec4, input + 16);
  vec4 = in16 - in23;
  ST_SH(vec4, input + 24);
  vec4 = in31 - in24;
  ST_SH(vec4, input + 96);
  vec4 = in30 - in25;
  ST_SH(vec4, input + 104);

  /* first group of eight odd coefficients (from the sum terms) */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr);
  ST_SH(vec4, temp_ptr + 960);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 448);
  ST_SH(vec4, temp_ptr + 512);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 704);
  ST_SH(vec5, temp_ptr + 256);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 192);
  ST_SH(vec5, temp_ptr + 768);

  /* second group of eight: reload the stashed differences */
  LD_SH4(input + 16, 8, in22, in23, in20, in21);
  LD_SH4(input + 80, 8, in26, in27, in24, in25);
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 832);
  ST_SH(vec4, temp_ptr + 128);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 320);
  ST_SH(vec4, temp_ptr + 640);
  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 576);
  ST_SH(vec4, temp_ptr + 384);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 64);
  ST_SH(vec4, temp_ptr + 896);
}
246
247static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
248 int16_t *tmp_buf, int16_t *tmp_buf_big) {
249 fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
250 fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
251 fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
252}
253
/* Row-pass equivalent of the column load/butterfly: reads 8 rows of 32
 * column-transformed coefficients (row pitch 32), transposes each 8x8 tile,
 * and applies the stage-1 BUTTERFLY_16 pairing columns c and 31-c.  Sum
 * outputs go to rows 0-15 of output, difference outputs to rows 16-31
 * (row pitch 8). */
static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
                                           int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;

  /* 1st set: columns 0-7 paired with columns 24-31 */
  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);

  /* 2nd set: columns 8-15 paired with columns 16-23 */
  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
         (output + 8 * 8), 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
}
286
/* Even half of the 32-point row DCT, high-precision variant used for the
 * first 8-row pass: the first few outputs are computed in 32-bit lanes
 * (UNPCK_SH_SW / DOTP_CONST_PAIR_W) before being packed back to 16 bits.
 *
 * input      - 16 stage-1 sum rows, pitch 8
 * interm_ptr - scratch; the stage-2 butterfly results are parked here and
 *              reloaded after the 32-bit section (registers are exhausted)
 * out        - even coefficient rows, pitch 8
 *
 * Rounding uses the *_NEG_* postproc variants, matching the rd row pass. */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* park stage-2 results; reloaded below once the 32-bit work is done */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3 - widen to 32-bit lanes for the first four coefficient rows */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
       tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
       vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  /* pack 32-bit lanes back down to 16-bit coefficient rows */
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  /* back to 16-bit arithmetic for the remaining rows */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
391
392static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
393 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
394 v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
395 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
396
397 /* fdct32 even */
398 /* stage 2 */
399 LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
400 LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
401
402 BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
403 in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
404 vec7, in8, in9, in10, in11, in12, in13, in14, in15);
405
406 /* Stage 3 */
407 ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
408 BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
409 DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
410 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
411 ST_SH(temp0, out);
412 ST_SH(temp1, out + 8);
413
414 DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
415 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
416 ST_SH(temp0, out + 16);
417 ST_SH(temp1, out + 24);
418
419 SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
420 DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
421 ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
422 DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
423 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
424 ST_SH(temp0, out + 32);
425 ST_SH(temp1, out + 56);
426
427 SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
428 DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
429 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
430 ST_SH(temp0, out + 40);
431 ST_SH(temp1, out + 48);
432
433 DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
434 DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
435 ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
436 DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
437 ADD2(in0, in1, in2, in3, vec0, vec7);
438 DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
439 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
440 ST_SH(temp0, out + 64);
441 ST_SH(temp1, out + 120);
442
443 SUB2(in0, in1, in2, in3, in0, in2);
444 DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
445 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
446 ST_SH(temp0, out + 72);
447 ST_SH(temp1, out + 112);
448
449 SUB2(in9, vec2, in14, vec5, vec2, vec5);
450 DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
451 SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
452 DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
453 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
454 ST_SH(temp0, out + 80);
455 ST_SH(temp1, out + 104);
456
457 ADD2(in3, in2, in0, in1, vec3, vec4);
458 DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
459 FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
460 ST_SH(temp0, out + 96);
461 ST_SH(temp1, out + 88);
462}
463
/* Odd half of the 32-point row DCT for one 8-row slice.
 *
 * temp       - 16 stage-1 difference rows, pitch 8
 * interm_ptr - scratch buffer: intermediate differences are parked at fixed
 *              slots (+32, +40, +48, +56, +64, +72, +80, +88) and reloaded
 *              in the second half of the function
 * out        - odd coefficient rows, pitch 8
 *
 * Results are rounded with FDCT_POSTPROC_2V_NEG_H before each store. */
static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
                                int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  /* park differences in scratch for the second half */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  /* park the remaining differences */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  /* first group of eight odd coefficients (from the sum terms) */
  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);

  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);

  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);

  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* second group of eight: reload the parked differences */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);

  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);

  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
595
/* Final pass of a 32x8 row transform: gathers the interleaved even/odd
 * coefficient rows out of temp (the irregular load offsets undo the
 * even/odd store ordering of the preceding passes - do not "simplify"
 * them), transposes each 8x8 tile, and writes four 8-column strips of the
 * output at pitch 32. */
static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;

  /* 1st set */
  in0 = LD_SH(temp);
  in4 = LD_SH(temp + 32);
  in2 = LD_SH(temp + 64);
  in6 = LD_SH(temp + 96);
  in1 = LD_SH(temp + 128);
  in7 = LD_SH(temp + 152);
  in3 = LD_SH(temp + 192);
  in5 = LD_SH(temp + 216);

  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);

  /* 2nd set */
  in0_1 = LD_SH(temp + 16);
  in1_1 = LD_SH(temp + 232);
  in2_1 = LD_SH(temp + 80);
  in3_1 = LD_SH(temp + 168);
  in4_1 = LD_SH(temp + 48);
  in5_1 = LD_SH(temp + 176);
  in6_1 = LD_SH(temp + 112);
  in7_1 = LD_SH(temp + 240);

  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);

  /* 3rd set */
  in0 = LD_SH(temp + 8);
  in1 = LD_SH(temp + 136);
  in2 = LD_SH(temp + 72);
  in3 = LD_SH(temp + 200);
  in4 = LD_SH(temp + 40);
  in5 = LD_SH(temp + 208);
  in6 = LD_SH(temp + 104);
  in7 = LD_SH(temp + 144);

  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
         32);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);

  /* 4th set */
  in0_1 = LD_SH(temp + 24);
  in1_1 = LD_SH(temp + 224);
  in2_1 = LD_SH(temp + 88);
  in3_1 = LD_SH(temp + 160);
  in4_1 = LD_SH(temp + 56);
  in5_1 = LD_SH(temp + 184);
  in6_1 = LD_SH(temp + 120);
  in7_1 = LD_SH(temp + 248);

  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
         32);
}
658
659static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
660 fdct8x32_1d_row_load_butterfly(temp, temp_buf);
661 fdct8x32_1d_row_even(temp_buf, temp_buf);
662 fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
663 fdct8x32_1d_row_transpose_store(temp_buf, output);
664}
665
666static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
667 int16_t *output) {
668 fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
669 fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
670 fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
671 fdct8x32_1d_row_transpose_store(tmp_buf, output);
672}
673
Yaowu Xuf883b422016-08-30 14:01:10 -0700674void aom_fdct32x32_msa(const int16_t *input, int16_t *output,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700675 int32_t src_stride) {
676 int32_t i;
677 DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
678 DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
679
680 /* column transform */
681 for (i = 0; i < 4; ++i) {
682 fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
683 tmp_buf_big + (8 * i));
684 }
685
686 /* row transform */
687 fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
688
689 /* row transform */
690 for (i = 1; i < 4; ++i) {
691 fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
692 }
693}
694
/* Even half of the 32-point row DCT, "_rd" (round) variant: the
 * FDCT_POSTPROC_2V_NEG_H rounding is applied once to all stage-2 outputs up
 * front, so the per-coefficient stores below write DOTP results directly.
 *
 * temp - 16 stage-1 sum rows, pitch 8
 * out  - even coefficient rows, pitch 8 */
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* round all stage-2 outputs now instead of before each store */
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);

  temp0 = in0 + in3;
  in0 = in0 - in3;
  in3 = in1 + in2;
  in1 = in1 - in2;

  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  /* remaining even coefficients from the stage-2 difference terms */
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
771
/* Odd-output half of the second (row-direction) 1-D pass of the 32x32
 * forward DCT, "rd" (rounding) variant.
 *
 * temp       - input: 16 v8i16 vectors stored contiguously (vector i at
 *              temp + 8 * i), produced by the preceding butterfly stage.
 * interm_ptr - scratch: differences are parked here while the first half
 *              of the outputs is computed, then reloaded below.
 * out        - output: 16 v8i16 vectors written at interleaved offsets
 *              (out, out+8, ..., out+120) in the order the transpose
 *              stage expects.
 *
 * DOTP_CONST_PAIR(a, b, c0, c1, o0, o1) is the paired rotation by the
 * cospi_* constants; FDCT_POSTPROC_2V_NEG_H applies the rd-specific
 * post-rounding to two vectors in place (NOTE(review): exact rounding
 * semantics are defined by the macro in fwd_txfm_msa.h - confirmed only
 * by name here). The ADDn/SUBn macros write their last n arguments.
 */
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
                                   int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
  v8i16 vec4, vec5;

  /* Rotate the four middle vectors by cospi_16_64, then round. */
  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  FDCT_POSTPROC_2V_NEG_H(in20, in21);
  FDCT_POSTPROC_2V_NEG_H(in26, in27);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  FDCT_POSTPROC_2V_NEG_H(in18, in19);
  FDCT_POSTPROC_2V_NEG_H(in28, in29);

  /* Park the difference terms; they feed the second half of the outputs
   * and are reloaded from interm_ptr further down. */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);

  /* Sum terms stay in registers: in21/in20/in27/in26 are overwritten. */
  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
  FDCT_POSTPROC_2V_NEG_H(in22, in23);
  FDCT_POSTPROC_2V_NEG_H(in24, in25);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  FDCT_POSTPROC_2V_NEG_H(in16, in17);
  FDCT_POSTPROC_2V_NEG_H(in30, in31);

  /* Same pattern for the outer vectors: stash differences, keep sums. */
  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  /* First half of the outputs: rotations on the sum terms, each pair of
   * results stored at mirrored offsets (k and 120 - k style). */
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* Second half: reload the parked differences and run the remaining
   * rotation/butterfly network on them. */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  in16 = in28 + in29;
  in19 = in31 + in30;
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
902
/* Row-direction pass over one 32x8 slice for the rd 32x32 forward DCT:
 * load + initial butterfly, even-output half (lower half of tmp_buf),
 * odd-output half (upper half of tmp_buf, using tmp_buf_big as scratch),
 * then transpose into the final output layout. */
static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
                               int16_t *output) {
  int16_t *const odd_half = tmp_buf + 128;

  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
  fdct8x32_1d_row_odd_rd(odd_half, tmp_buf_big, odd_half);
  fdct8x32_1d_row_transpose_store(tmp_buf, output);
}
910
Yaowu Xuf883b422016-08-30 14:01:10 -0700911void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700912 int32_t src_stride) {
913 int32_t i;
914 DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
915 DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
916
917 /* column transform */
918 for (i = 0; i < 4; ++i) {
919 fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
920 &tmp_buf_big[0] + (8 * i));
921 }
922
923 /* row transform */
924 for (i = 0; i < 4; ++i) {
925 fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
926 out + (8 * i * 32));
927 }
928}
929
Yaowu Xuf883b422016-08-30 14:01:10 -0700930void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700931 int sum = LD_HADD(input, stride);
932 sum += LD_HADD(input + 8, stride);
933 sum += LD_HADD(input + 16, stride);
934 sum += LD_HADD(input + 24, stride);
935 sum += LD_HADD(input + 32 * 8, stride);
936 sum += LD_HADD(input + 32 * 8 + 8, stride);
937 sum += LD_HADD(input + 32 * 8 + 16, stride);
938 sum += LD_HADD(input + 32 * 8 + 24, stride);
939 sum += LD_HADD(input + 32 * 16, stride);
940 sum += LD_HADD(input + 32 * 16 + 8, stride);
941 sum += LD_HADD(input + 32 * 16 + 16, stride);
942 sum += LD_HADD(input + 32 * 16 + 24, stride);
943 sum += LD_HADD(input + 32 * 24, stride);
944 sum += LD_HADD(input + 32 * 24 + 8, stride);
945 sum += LD_HADD(input + 32 * 24 + 16, stride);
946 sum += LD_HADD(input + 32 * 24 + 24, stride);
947 out[0] = (int16_t)(sum >> 3);
948}