blob: 7921219b5fa1e238787befa3aec0937a1825a011 [file] [log] [blame]
Deb Mukherjee47031c02014-05-16 18:52:01 -07001/*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
James Zern3a7d4672014-08-10 16:15:18 -070011#include "libyuv/row.h"
Deb Mukherjee47031c02014-05-16 18:52:01 -070012
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC Neon.
Johann223bf292015-03-02 15:19:19 -080019#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
20 !defined(__aarch64__)
Deb Mukherjee47031c02014-05-16 18:52:01 -070021
22// NEON downscalers with interpolation.
23// Provided by Fritz Koenig
24
25// Read 32x1 throw away even pixels, and write 16x1.
26void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
27 uint8* dst, int dst_width) {
28 asm volatile (
29 ".p2align 2 \n"
30 "1: \n"
31 // load even pixels into q0, odd into q1
James Zern3a7d4672014-08-10 16:15:18 -070032 MEMACCESS(0)
Deb Mukherjee47031c02014-05-16 18:52:01 -070033 "vld2.8 {q0, q1}, [%0]! \n"
34 "subs %2, %2, #16 \n" // 16 processed per loop
James Zern3a7d4672014-08-10 16:15:18 -070035 MEMACCESS(1)
Deb Mukherjee47031c02014-05-16 18:52:01 -070036 "vst1.8 {q1}, [%1]! \n" // store odd pixels
37 "bgt 1b \n"
38 : "+r"(src_ptr), // %0
39 "+r"(dst), // %1
40 "+r"(dst_width) // %2
41 :
42 : "q0", "q1" // Clobber List
43 );
44}
45
46// Read 32x2 average down and write 16x1.
47void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
48 uint8* dst, int dst_width) {
49 asm volatile (
50 // change the stride to row 2 pointer
51 "add %1, %0 \n"
52 ".p2align 2 \n"
53 "1: \n"
James Zern3a7d4672014-08-10 16:15:18 -070054 MEMACCESS(0)
Deb Mukherjee47031c02014-05-16 18:52:01 -070055 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
James Zern3a7d4672014-08-10 16:15:18 -070056 MEMACCESS(1)
Deb Mukherjee47031c02014-05-16 18:52:01 -070057 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
58 "subs %3, %3, #16 \n" // 16 processed per loop
59 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
60 "vpaddl.u8 q1, q1 \n"
61 "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
62 "vpadal.u8 q1, q3 \n"
63 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
64 "vrshrn.u16 d1, q1, #2 \n"
James Zern3a7d4672014-08-10 16:15:18 -070065 MEMACCESS(2)
Deb Mukherjee47031c02014-05-16 18:52:01 -070066 "vst1.8 {q0}, [%2]! \n"
67 "bgt 1b \n"
68 : "+r"(src_ptr), // %0
69 "+r"(src_stride), // %1
70 "+r"(dst), // %2
71 "+r"(dst_width) // %3
72 :
73 : "q0", "q1", "q2", "q3" // Clobber List
74 );
75}
76
// Point-sample to 1/4 width: read 32 source bytes, keep pixel index 2 of
// every group of 4, write 8 bytes.  src_stride is unused (single row).
// dst_width must be a multiple of 8.
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
  "1: \n"
    // MEMACCESS: libyuv macro from row.h annotating the memory operand
    // index — presumably for sanitizer builds; confirm in row.h.
    MEMACCESS(0)
    // vld4 de-interleaves every 4th byte into d0..d3.
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    "subs %2, %2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    // d2 holds the 3rd pixel of each group of 4.
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}
95
// Box-filter to 1/4 size: each output pixel is the rounded average of a
// 4x4 block (16 samples) spanning four consecutive source rows.
// Produces 4 output pixels per loop iteration; dst_width must be a
// multiple of 4.
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  // Pointers to source rows 1..3; row 0 is src_ptr itself.
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load up 16x4
    MEMACCESS(3)
    "vld1.8 {q1}, [%3]! \n"
    MEMACCESS(4)
    "vld1.8 {q2}, [%4]! \n"
    MEMACCESS(5)
    "vld1.8 {q3}, [%5]! \n"
    "subs %2, %2, #4 \n"
    // Sum all 4 rows pairwise into 16-bit lanes...
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    // ...then sum adjacent pairs again into 32-bit lanes, so each lane
    // holds the sum of a full 4x4 block.
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    MEMACCESS(1)
    "vst1.32 {d0[0]}, [%1]! \n" // 4 output pixels
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width), // %2
    "+r"(src_ptr1), // %3
    "+r"(src_ptr2), // %4
    "+r"(src_ptr3) // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
133
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels: of each group of 4 source
// pixels, pixels 0, 1 and 3 are kept (pixel 2 is dropped).
// src_stride is unused; dst_width must be a multiple of 24.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    "subs %2, %2, #24 \n"
    // Replace the dropped lane: output becomes pixels 0, 1, 3.
    "vmov d2, d3 \n" // order d0, d1, d2
    MEMACCESS(1)
    // vst3 re-interleaves the three kept lanes into 24 output bytes.
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
157
// 3/4 box downscaler, phase 0: vertically blends the two source rows with
// weights 3:1 (row0*3 + row1, rounded, >>2), then horizontally filters
// each group of 4 pixels down to 3.  Consumes 32 pixels per row and
// writes 24 per iteration; dst_width must be a multiple of 24.
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n" // constant 3 for the weighted blends
    "add %3, %0 \n" // %3 = pointer to source row 1
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // Horizontal pass on the blended row (d0..d3 hold pixels 0..3 of
    // each group of 4):
    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n" // interleave 3 outputs per group

    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width), // %2
    "+r"(src_stride) // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
217
218void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
219 ptrdiff_t src_stride,
220 uint8* dst_ptr, int dst_width) {
221 asm volatile (
222 "vmov.u8 d24, #3 \n"
223 "add %3, %0 \n"
224 ".p2align 2 \n"
225 "1: \n"
James Zern3a7d4672014-08-10 16:15:18 -0700226 MEMACCESS(0)
Deb Mukherjee47031c02014-05-16 18:52:01 -0700227 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
James Zern3a7d4672014-08-10 16:15:18 -0700228 MEMACCESS(3)
Deb Mukherjee47031c02014-05-16 18:52:01 -0700229 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
230 "subs %2, %2, #24 \n"
231 // average src line 0 with src line 1
232 "vrhadd.u8 q0, q0, q2 \n"
233 "vrhadd.u8 q1, q1, q3 \n"
234
235 // a0 = (src[0] * 3 + s[1] * 1) >> 2
236 "vmovl.u8 q3, d1 \n"
237 "vmlal.u8 q3, d0, d24 \n"
238 "vqrshrn.u16 d0, q3, #2 \n"
239
240 // a1 = (src[1] * 1 + s[2] * 1) >> 1
241 "vrhadd.u8 d1, d1, d2 \n"
242
243 // a2 = (src[2] * 1 + s[3] * 3) >> 2
244 "vmovl.u8 q3, d2 \n"
245 "vmlal.u8 q3, d3, d24 \n"
246 "vqrshrn.u16 d2, q3, #2 \n"
247
James Zern3a7d4672014-08-10 16:15:18 -0700248 MEMACCESS(1)
Deb Mukherjee47031c02014-05-16 18:52:01 -0700249 "vst3.8 {d0, d1, d2}, [%1]! \n"
250 "bgt 1b \n"
251 : "+r"(src_ptr), // %0
252 "+r"(dst_ptr), // %1
253 "+r"(dst_width), // %2
254 "+r"(src_stride) // %3
255 :
256 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
257 );
258}
259
// Advertises the 3/8 downscalers below.
// NOTE(review): presumably checked by the dispatch code in scale.cc — confirm.
#define HAS_SCALEROWDOWN38_NEON
// vtbl index table: picks the 12 point-sampled bytes out of 32 for
// ScaleRowDown38_NEON; the trailing 4 lanes are unused padding.
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// vtbl index table used by the box variants to interleave the filtered
// results from d0/d1/d2 into the 12 output bytes.
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
// Fixed-point reciprocals for vqrdmulh, which computes (2*a*b) >> 16:
// multiplying by 65536/12 therefore divides a sum by 6, and 65536/18
// divides by 9 (6 = 2x3 samples, 9 = 3x3 samples per output pixel).
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
271
// 32 -> 12
// Point-sample to 3/8 width: selects 12 of every 32 source pixels via a
// table lookup (indices in kShuf38).  src_stride is unused; dst_width
// must be a multiple of 12.
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q3}, [%3] \n" // q3 = kShuf38 lookup indices
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    // Gather the 12 selected bytes from the 32 loaded ones.
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d4}, [%1]! \n" // first 8 output bytes
    MEMACCESS(1)
    "vst1.32 {d5[0]}, [%1]! \n" // remaining 4 output bytes
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(&kShuf38) // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
298
// 32x3 -> 12x1
// Box-filters 3 source rows down to one 3/8-width row: most output
// pixels average a 3x3 block (divide by 9 via kMult38_Div9); the last
// pixel of each group averages a 2x3 block (divide by 6 via
// kMult38_Div6).  dst_width must be a multiple of 12.
// OMITFP: libyuv macro from row.h — presumably controls frame-pointer
// omission for this function; confirm in row.h.
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  // Third source row; the second is formed below by adding the stride.
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16 {q13}, [%5] \n" // q13 = 65536/12 (divide-by-6 factor)
    MEMACCESS(6)
    "vld1.8 {q14}, [%6] \n" // q14 = kShuf38_2 output shuffle
    MEMACCESS(7)
    "vld1.8 {q15}, [%7] \n" // q15 = 65536/18 (divide-by-9 factor)
    "add %3, %0 \n" // %3 = pointer to source row 1
    ".p2align 2 \n"
  "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    MEMACCESS(4)
    "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to get align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n" // first 8 output bytes
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n" // remaining 4 output bytes
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width), // %2
    "+r"(src_stride), // %3
    "+r"(src_ptr1) // %4
  : "r"(&kMult38_Div6), // %5
    "r"(&kShuf38_2), // %6
    "r"(&kMult38_Div9) // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}
418
// 32x2 -> 12x1
// Box-filters 2 source rows down to one 3/8-width row.  Same structure
// as the 3-row variant but with one fewer input row: 3x2 blocks are
// divided by 6 (kMult38_Div6) and the 2x2 tail block by 4 (vqrshrn #2).
// dst_width must be a multiple of 12.
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n" // q13 = 65536/12 (divide-by-6 factor)
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n" // q14 = kShuf38_2 output shuffle
    "add %3, %0 \n" // %3 = pointer to source row 1
    ".p2align 2 \n"
  "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to get align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n" // first 8 output bytes
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n" // remaining 4 output bytes
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width), // %2
    "+r"(src_stride) // %3
  : "r"(&kMult38_Div6), // %4
    "r"(&kShuf38_2) // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
519
// 16x2 -> 16x1
// Vertically interpolates between two source rows:
//   dst = src_row0 * (256 - f) / 256 + src_row1 * f / 256
// where f = source_y_fraction in [0, 256).  Fractions 0, 64, 128 and 192
// take dedicated fast paths (copy, 75/25, 50/50, 25/75).  After the loop
// the last computed byte is stored once more at dst[dst_width]
// (NOTE(review): presumably for the caller's right-edge replication —
// confirm against scale.cc usage).  dst_width must be a multiple of 16.
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n" // f == 0: plain copy of row 0
    "add %2, %1 \n" // %2 = pointer to source row 1
    "cmp %4, #64 \n"
    "beq 75f \n" // f == 64: 75% row0 + 25% row1
    "cmp %4, #128 \n"
    "beq 50f \n" // f == 128: 50/50 average
    "cmp %4, #192 \n"
    "beq 25f \n" // f == 192: 25% row0 + 75% row1

    "vdup.8 d5, %4 \n" // d5 = f (row 1 weight)
    "rsb %4, #256 \n" // %4 = 256 - f
    "vdup.8 d4, %4 \n" // d4 = 256 - f (row 0 weight)
    // General purpose row blend.
  "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n" // (row0*(256-f) + row1*f + 128) >> 8
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75: two rounding half-adds give (row0 + 3*row1) / 4.
  "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
  "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25: rows swapped into q1/q0 so the same double
    // half-add yields (3*row0 + row1) / 4.
  "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

  "99: \n"
    // Duplicate the final output byte at dst[dst_width].
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+r"(src_stride), // %2
    "+r"(dst_width), // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
618
// Point-sample ARGB to 1/2 width: keeps the odd-indexed 32-bit pixels.
// src_stride is unused; dst_width (in pixels) must be a multiple of 8.
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
  "1: \n"
    // load even pixels into q0, odd into q1 (vld2.32 de-interleaves
    // whole 4-byte ARGB pixels)
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst), // %1
    "+r"(dst_width) // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
  );
}
642
// Box-filter ARGB 2x2 -> 1x1: each output pixel is the rounded average
// of a 2x2 block, computed per channel (B, G, R, A kept separate by the
// vld4/vst4 de-interleave).  dst_width must be a multiple of 8.
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(src_stride), // %1
    "+r"(dst), // %2
    "+r"(dst_width) // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
682
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// Point-samples every src_stepx-th ARGB pixel from one row.
// src_stride is unused; dst_width must be a multiple of 4.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %3, lsl #2 \n" // r12 = byte step (src_stepx * 4)
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n" // load one pixel, advance by step
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n" // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(dst_width) // %2
  : "r"(src_stepx) // %3
  : "memory", "cc", "r12", "q0"
  );
}
710
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// Box-filtered variant of ScaleARGBRowDownEven: each output pixel is the
// rounded per-channel average of a 2x2 block; blocks start every
// src_stepx pixels and span two rows.  dst_width must be a multiple of 4.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n" // r12 = byte step (src_stepx * 4)
    "add %1, %1, %0 \n" // %1 = pointer to row 2
    ".p2align 2 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    // Sum the two rows of each block into 16-bit channel lanes.
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
    "subs %3, %3, #4 \n" // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_stride), // %1
    "+r"(dst_argb), // %2
    "+r"(dst_width) // %3
  : "r"(src_stepx) // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}
759
Johann223bf292015-03-02 15:19:19 -0800760#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
Deb Mukherjee47031c02014-05-16 18:52:01 -0700761
762#ifdef __cplusplus
763} // extern "C"
764} // namespace libyuv
765#endif