/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load even pixels into q0, odd into q1
      "vld2.8      {q0, q1}, [%0]!               \n"
      "subs        %2, %2, #16                   \n"  // 16 processed per loop
      "vst1.8      {q1}, [%1]!                   \n"  // store odd pixels
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1"  // Clobber List
  );
}
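
// A rough scalar model of ScaleRowDown2_NEON above (illustration only, not
// part of libyuv's build): a 2x point-sample that keeps the odd pixel of
// each horizontal pair, matching the "store odd pixels" store.
static inline void ScaleRowDown2_SketchC(const uint8_t* src_ptr,
                                         uint8_t* dst,
                                         int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];  // keep the odd pixel of each pair
  }
}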

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld2.8      {q0, q1}, [%0]!               \n"  // load 32 pixels
      "subs        %2, %2, #16                   \n"  // 16 processed per loop
      "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
      "vst1.8      {q0}, [%1]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1"  // Clobber List
  );
}
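
// A rough scalar model of the linear (horizontal-only) path above
// (illustration only, not part of libyuv's build): each output pixel is the
// rounded average of a horizontal pair, as done by vrhadd.u8.
static inline void ScaleRowDown2Linear_SketchC(const uint8_t* src_ptr,
                                               uint8_t* dst,
                                               int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}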

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %0                        \n"
      "1:                                        \n"
      "vld1.8      {q0, q1}, [%0]!               \n"  // load row 1 and post inc
      "vld1.8      {q2, q3}, [%1]!               \n"  // load row 2 and post inc
      "subs        %3, %3, #16                   \n"  // 16 processed per loop
      "vpaddl.u8   q0, q0                        \n"  // row 1 add adjacent
      "vpaddl.u8   q1, q1                        \n"
      "vpadal.u8   q0, q2                        \n"  // row 2 add adjacent + row1
      "vpadal.u8   q1, q3                        \n"
      "vrshrn.u16  d0, q0, #2                    \n"  // downshift, round and pack
      "vrshrn.u16  d1, q1, #2                    \n"
      "vst1.8      {q0}, [%2]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
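
// A rough scalar model of the 2x2 box filter above (illustration only, not
// part of libyuv's build): pairwise adds across both rows followed by a
// rounding shift, matching the vpaddl/vpadal/vrshrn sequence.
static inline void ScaleRowDown2Box_SketchC(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst,
                                            int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    int sum = s0[2 * x] + s0[2 * x + 1] + s1[2 * x] + s1[2 * x + 1];
    dst[x] = (uint8_t)((sum + 2) >> 2);  // round and divide by 4
  }
}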

void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "vst1.8      {d2}, [%1]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "q0", "q1", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load up 16x4
      "vld1.8      {q1}, [%3]!                   \n"
      "vld1.8      {q2}, [%4]!                   \n"
      "vld1.8      {q3}, [%5]!                   \n"
      "subs        %2, %2, #4                    \n"
      "vpaddl.u8   q0, q0                        \n"
      "vpadal.u8   q0, q1                        \n"
      "vpadal.u8   q0, q2                        \n"
      "vpadal.u8   q0, q3                        \n"
      "vpaddl.u16  q0, q0                        \n"
      "vrshrn.u32  d0, q0, #4                    \n"  // divide by 16 w/rounding
      "vmovn.u16   d0, q0                        \n"
      "vst1.32     {d0[0]}, [%1]!                \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_ptr1),   // %3
        "+r"(src_ptr2),   // %4
        "+r"(src_ptr3)    // %5
      :
      : "q0", "q1", "q2", "q3", "memory", "cc");
}
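
// A rough scalar model of the 4x4 box filter above (illustration only, not
// part of libyuv's build): each output pixel is the rounded average of a
// 4x4 block, i.e. a sum of 16 bytes divided by 16 with rounding (vrshrn #4).
static inline void ScaleRowDown4Box_SketchC(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += src_ptr[row * src_stride + 4 * x + col];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}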

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load up every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "subs        %2, %2, #24                   \n"
      "vmov        d2, d3                        \n"  // order d0, d1, d2
      "vst3.8      {d0, d1, d2}, [%1]!           \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "d0", "d1", "d2", "d3", "memory", "cc");
}

void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vmov.u8     d24, #3                       \n"
      "add         %3, %0                        \n"
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"  // src line 1
      "subs        %2, %2, #24                   \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "vmovl.u8    q8, d4                        \n"
      "vmovl.u8    q9, d5                        \n"
      "vmovl.u8    q10, d6                       \n"
      "vmovl.u8    q11, d7                       \n"

      // 3 * line_0 + line_1
      "vmlal.u8    q8, d0, d24                   \n"
      "vmlal.u8    q9, d1, d24                   \n"
      "vmlal.u8    q10, d2, d24                  \n"
      "vmlal.u8    q11, d3, d24                  \n"

      // (3 * line_0 + line_1) >> 2
      "vqrshrn.u16 d0, q8, #2                    \n"
      "vqrshrn.u16 d1, q9, #2                    \n"
      "vqrshrn.u16 d2, q10, #2                   \n"
      "vqrshrn.u16 d3, q11, #2                   \n"

      // a0 = (src[0] * 3 + src[1] * 1) >> 2
      "vmovl.u8    q8, d1                        \n"
      "vmlal.u8    q8, d0, d24                   \n"
      "vqrshrn.u16 d0, q8, #2                    \n"

      // a1 = (src[1] * 1 + src[2] * 1) >> 1
      "vrhadd.u8   d1, d1, d2                    \n"

      // a2 = (src[2] * 1 + src[3] * 3) >> 2
      "vmovl.u8    q8, d2                        \n"
      "vmlal.u8    q8, d3, d24                   \n"
      "vqrshrn.u16 d2, q8, #2                    \n"

      "vst3.8      {d0, d1, d2}, [%1]!           \n"

      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
        "cc");
}
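
// A rough scalar model of the 3/4 box filter above (illustration only, not
// part of libyuv's build). Rows are first blended as (3 * row0 + row1 + 2) >> 2
// (vmlal followed by vqrshrn #2), then each group of 4 filtered pixels
// t0..t3 is reduced to 3 outputs with weights (3,1)/4, (1,1)/2 and (1,3)/4.
static inline void ScaleRowDown34_0_Box_SketchC(const uint8_t* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8_t* dst_ptr,
                                                int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    uint8_t t[4];
    for (int i = 0; i < 4; ++i) {
      t[i] = (uint8_t)((3 * s0[i] + s1[i] + 2) >> 2);
    }
    dst_ptr[x + 0] = (uint8_t)((3 * t[0] + t[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((t[1] + t[2] + 1) >> 1);
    dst_ptr[x + 2] = (uint8_t)((t[2] + 3 * t[3] + 2) >> 2);
    s0 += 4;
    s1 += 4;
  }
}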

void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vmov.u8     d24, #3                       \n"
      "add         %3, %0                        \n"
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // src line 0
      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"  // src line 1
      "subs        %2, %2, #24                   \n"
      // average src line 0 with src line 1
      "vrhadd.u8   q0, q0, q2                    \n"
      "vrhadd.u8   q1, q1, q3                    \n"

      // a0 = (src[0] * 3 + src[1] * 1) >> 2
      "vmovl.u8    q3, d1                        \n"
      "vmlal.u8    q3, d0, d24                   \n"
      "vqrshrn.u16 d0, q3, #2                    \n"

      // a1 = (src[1] * 1 + src[2] * 1) >> 1
      "vrhadd.u8   d1, d1, d2                    \n"

      // a2 = (src[2] * 1 + src[3] * 3) >> 2
      "vmovl.u8    q3, d2                        \n"
      "vmlal.u8    q3, d3, d24                   \n"
      "vqrshrn.u16 d2, q3, #2                    \n"

      "vst3.8      {d0, d1, d2}, [%1]!           \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
}

#define HAS_SCALEROWDOWN38_NEON
static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
                              22, 24, 27, 30, 0,  0,  0,  0};
static const uvec8 kShuf38_2 = {0,  8, 16, 2,  10, 17, 4, 12,
                                18, 6, 14, 19, 0,  0,  0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
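
// The Div6/Div9 constants above avoid an integer divide: vqrdmulh.s16
// computes roughly (a * b * 2 + 32768) >> 16, so multiplying a 16-bit sum of
// 6 (or 9) pixels by 65536 / 12 (or 65536 / 18) yields sum / 6 (or sum / 9).
// A rough scalar model of that step (illustration only, not libyuv API):
static inline int16_t ScaleSumDiv6_Sketch(int16_t sum) {
  return (int16_t)((sum * (65536 / 12) * 2 + 32768) >> 16);  // ~= sum / 6
}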

// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "vld1.8      {q3}, [%3]                    \n"
      "1:                                        \n"
      "vld1.8      {d0, d1, d2, d3}, [%0]!       \n"
      "subs        %2, %2, #12                   \n"
      "vtbl.u8     d4, {d0, d1, d2, d3}, d6      \n"
      "vtbl.u8     d5, {d0, d1, d2, d3}, d7      \n"
      "vst1.8      {d4}, [%1]!                   \n"
      "vst1.32     {d5[0]}, [%1]!                \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile(
      "vld1.16     {q13}, [%5]                   \n"
      "vld1.8      {q14}, [%6]                   \n"
      "vld1.8      {q15}, [%7]                   \n"
      "add         %3, %0                        \n"
      "1:                                        \n"

      // d0 = 00 40 01 41 02 42 03 43
      // d1 = 10 50 11 51 12 52 13 53
      // d2 = 20 60 21 61 22 62 23 63
      // d3 = 30 70 31 71 32 72 33 73
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"
      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"
      "vld4.8      {d16, d17, d18, d19}, [%4]!   \n"
      "subs        %2, %2, #12                   \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // d0 = 00 10 01 11 02 12 03 13
      // d1 = 40 50 41 51 42 52 43 53
      "vtrn.u8     d0, d1                        \n"
      "vtrn.u8     d4, d5                        \n"
      "vtrn.u8     d16, d17                      \n"

      // d2 = 20 30 21 31 22 32 23 33
      // d3 = 60 70 61 71 62 72 63 73
      "vtrn.u8     d2, d3                        \n"
      "vtrn.u8     d6, d7                        \n"
      "vtrn.u8     d18, d19                      \n"

      // d0 = 00+10 01+11 02+12 03+13
      // d2 = 40+50 41+51 42+52 43+53
      "vpaddl.u8   q0, q0                        \n"
      "vpaddl.u8   q2, q2                        \n"
      "vpaddl.u8   q8, q8                        \n"

      // d3 = 60+70 61+71 62+72 63+73
      "vpaddl.u8   d3, d3                        \n"
      "vpaddl.u8   d7, d7                        \n"
      "vpaddl.u8   d19, d19                      \n"

      // combine source lines
      "vadd.u16    q0, q2                        \n"
      "vadd.u16    q0, q8                        \n"
      "vadd.u16    d4, d3, d7                    \n"
      "vadd.u16    d4, d19                       \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //            + s[6 + st * 1] + s[7 + st * 1]
      //            + s[6 + st * 2] + s[7 + st * 2]) / 6
      "vqrdmulh.s16 q2, q2, q13                  \n"
      "vmovn.u16   d4, q2                        \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "vmovl.u8    q1, d2                        \n"
      "vmovl.u8    q3, d6                        \n"
      "vmovl.u8    q9, d18                       \n"

      // combine source lines
      "vadd.u16    q1, q3                        \n"
      "vadd.u16    q1, q9                        \n"

      // d4 = xx 20 xx 30 xx 22 xx 32
      // d5 = xx 21 xx 31 xx 23 xx 33
      "vtrn.u32    d2, d3                        \n"

      // d4 = xx 20 xx 21 xx 22 xx 23
      // d5 = xx 30 xx 31 xx 32 xx 33
      "vtrn.u16    d2, d3                        \n"

      // 0+1+2, 3+4+5
      "vadd.u16    q0, q1                        \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "vqrdmulh.s16 q0, q0, q15                  \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent
      "vmov.u8     d2, d4                        \n"

      "vtbl.u8     d3, {d0, d1, d2}, d28         \n"
      "vtbl.u8     d4, {d0, d1, d2}, d29         \n"

      "vst1.8      {d3}, [%1]!                   \n"
      "vst1.32     {d4[0]}, [%1]!                \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),       // %0
        "+r"(dst_ptr),       // %1
        "+r"(dst_width),     // %2
        "+r"(src_stride),    // %3
        "+r"(src_ptr1)       // %4
      : "r"(&kMult38_Div6),  // %5
        "r"(&kShuf38_2),     // %6
        "r"(&kMult38_Div9)   // %7
      : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
        "cc");
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vld1.16     {q13}, [%4]                   \n"
      "vld1.8      {q14}, [%5]                   \n"
      "add         %3, %0                        \n"
      "1:                                        \n"

      // d0 = 00 40 01 41 02 42 03 43
      // d1 = 10 50 11 51 12 52 13 53
      // d2 = 20 60 21 61 22 62 23 63
      // d3 = 30 70 31 71 32 72 33 73
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"
      "vld4.8      {d4, d5, d6, d7}, [%3]!       \n"
      "subs        %2, %2, #12                   \n"

      // Shuffle the input data around to align the data
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // d0 = 00 10 01 11 02 12 03 13
      // d1 = 40 50 41 51 42 52 43 53
      "vtrn.u8     d0, d1                        \n"
      "vtrn.u8     d4, d5                        \n"

      // d2 = 20 30 21 31 22 32 23 33
      // d3 = 60 70 61 71 62 72 63 73
      "vtrn.u8     d2, d3                        \n"
      "vtrn.u8     d6, d7                        \n"

      // d0 = 00+10 01+11 02+12 03+13
      // d2 = 40+50 41+51 42+52 43+53
      "vpaddl.u8   q0, q0                        \n"
      "vpaddl.u8   q2, q2                        \n"

      // d3 = 60+70 61+71 62+72 63+73
      "vpaddl.u8   d3, d3                        \n"
      "vpaddl.u8   d7, d7                        \n"

      // combine source lines
      "vadd.u16    q0, q2                        \n"
      "vadd.u16    d4, d3, d7                    \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "vqrshrn.u16 d4, q2, #2                    \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "vmovl.u8    q1, d2                        \n"
      "vmovl.u8    q3, d6                        \n"

      // combine source lines
      "vadd.u16    q1, q3                        \n"

      // d4 = xx 20 xx 30 xx 22 xx 32
      // d5 = xx 21 xx 31 xx 23 xx 33
      "vtrn.u32    d2, d3                        \n"

      // d4 = xx 20 xx 21 xx 22 xx 23
      // d5 = xx 30 xx 31 xx 32 xx 33
      "vtrn.u16    d2, d3                        \n"

      // 0+1+2, 3+4+5
      "vadd.u16    q0, q1                        \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "vqrdmulh.s16 q0, q0, q13                  \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent
      "vmov.u8     d2, d4                        \n"

      "vtbl.u8     d3, {d0, d1, d2}, d28         \n"
      "vtbl.u8     d4, {d0, d1, d2}, d29         \n"

      "vst1.8      {d3}, [%1]!                   \n"
      "vst1.32     {d4[0]}, [%1]!                \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),       // %0
        "+r"(dst_ptr),       // %1
        "+r"(dst_width),     // %2
        "+r"(src_stride)     // %3
      : "r"(&kMult38_Div6),  // %4
        "r"(&kShuf38_2)      // %5
      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:                                        \n"
      "vld1.16     {q1, q2}, [%1]                \n"  // load accumulator
      "vld1.8      {q0}, [%0]!                   \n"  // load 16 bytes
      "vaddw.u8    q2, q2, d1                    \n"  // add
      "vaddw.u8    q1, q1, d0                    \n"
      "vst1.16     {q1, q2}, [%1]!               \n"  // store accumulator
      "subs        %2, %2, #16                   \n"  // 16 processed per loop
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2"  // Clobber List
  );
}
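
// A rough scalar model of the accumulation above (illustration only, not
// part of libyuv's build): each byte is widened and added into a running
// 16-bit sum, which the box-filter caller later divides by the row count.
static inline void ScaleAddRow_SketchC(const uint8_t* src_ptr,
                                       uint16_t* dst_ptr,
                                       int src_width) {
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}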

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                      \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5                     \n" \
  "add        %3, %3, %4                     \n" \
  "vld2.8     {d6[" #n "], d7[" #n "]}, [%6] \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  asm volatile (
      "vdup.32    q0, %3                         \n"  // x
      "vdup.32    q1, %4                         \n"  // dx
      "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
      "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
      "vmul.s32   q1, q1, q2                     \n"
      // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32   q1, q1, q0                     \n"
      // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "vadd.s32   q2, q1, q3                     \n"
      "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
      "1:                                        \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
      "vmov       q10, q1                        \n"
      "vmov       q11, q2                        \n"
      "vuzp.16    q10, q11                       \n"
      "vmovl.u8   q8, d6                         \n"
      "vmovl.u8   q9, d7                         \n"
      "vsubl.s16  q11, d18, d16                  \n"
      "vsubl.s16  q12, d19, d17                  \n"
      "vmovl.u16  q13, d20                       \n"
      "vmovl.u16  q10, d21                       \n"
      "vmul.s32   q11, q11, q13                  \n"
      "vmul.s32   q12, q12, q10                  \n"
      "vrshrn.s32 d18, q11, #16                  \n"
      "vrshrn.s32 d19, q12, #16                  \n"
      "vadd.s16   q8, q8, q9                     \n"
      "vmovn.s16  d6, q8                         \n"

      "vst1.8     {d6}, [%0]!                    \n"  // store pixels
      "vadd.s32   q1, q1, q0                     \n"
      "vadd.s32   q2, q2, q0                     \n"
      "subs       %2, %2, #8                     \n"  // 8 processed per loop
      "bgt        1b                             \n"
    : "+r"(dst_ptr),    // %0
      "+r"(src_ptr),    // %1
      "+r"(dst_width),  // %2
      "+r"(x),          // %3
      "+r"(dx),         // %4
      "+r"(tmp),        // %5
      "+r"(src_tmp)     // %6
    :
    : "memory", "cc", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE
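
// A rough scalar model of the horizontal filtering above, using the BLENDER
// formula quoted before the function (illustration only, not part of
// libyuv's build). x and dx are 16.16 fixed point.
static inline void ScaleFilterCols_SketchC(uint8_t* dst_ptr,
                                           const uint8_t* src_ptr,
                                           int dst_width,
                                           int x,
                                           int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;  // integer source position
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    int f = x & 0xffff;  // fraction between a and b
    dst_ptr[j] = (uint8_t)(a + (((f * (b - a)) + 0x8000) >> 16));
    x += dx;
  }
}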

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "cmp         %4, #0                        \n"
      "beq         100f                          \n"
      "add         %2, %1                        \n"
      "cmp         %4, #64                       \n"
      "beq         75f                           \n"
      "cmp         %4, #128                      \n"
      "beq         50f                           \n"
      "cmp         %4, #192                      \n"
      "beq         25f                           \n"

      "vdup.8      d5, %4                        \n"
      "rsb         %4, #256                      \n"
      "vdup.8      d4, %4                        \n"
      // General purpose row blend.
      "1:                                        \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vmull.u8    q13, d0, d4                   \n"
      "vmull.u8    q14, d1, d4                   \n"
      "vmlal.u8    q13, d2, d5                   \n"
      "vmlal.u8    q14, d3, d5                   \n"
      "vrshrn.u16  d0, q13, #8                   \n"
      "vrshrn.u16  d1, q14, #8                   \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         1b                            \n"
      "b           99f                           \n"

      // Blend 25 / 75.
      "25:                                       \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"
      "vrhadd.u8   q0, q1                        \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         25b                           \n"
      "b           99f                           \n"

      // Blend 50 / 50.
      "50:                                       \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         50b                           \n"
      "b           99f                           \n"

      // Blend 75 / 25.
      "75:                                       \n"
      "vld1.8      {q1}, [%1]!                   \n"
      "vld1.8      {q0}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"
      "vrhadd.u8   q0, q1                        \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         75b                           \n"
      "b           99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100:                                      \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         100b                          \n"

      "99:                                       \n"
      "vst1.8      {d1[7]}, [%0]                 \n"
      : "+r"(dst_ptr),           // %0
        "+r"(src_ptr),           // %1
        "+r"(src_stride),        // %2
        "+r"(dst_width),         // %3
        "+r"(source_y_fraction)  // %4
      :
      : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
}
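
// A rough scalar model of the general-purpose blend path above (illustration
// only, not part of libyuv's build). source_y_fraction is 0..256, measured
// toward the second row; the 0, 64, 128 and 192 labels are just faster
// special cases of this blend.
static inline void ScaleFilterRows_SketchC(uint8_t* dst_ptr,
                                           const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           int dst_width,
                                           int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    int blended =
        src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128;
    dst_ptr[x] = (uint8_t)(blended >> 8);
  }
}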

void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld4.32     {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.32     {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "vmov        q2, q1                        \n"  // odd pixels next to q3
      "vst2.32     {q2, q3}, [%1]!               \n"  // store odd pixels
      "bgt         1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

//  46:  f964 018d   vld4.32  {d16,d18,d20,d22}, [r4]!
//  4a:  3e04        subs     r6, #4
//  4c:  f964 118d   vld4.32  {d17,d19,d21,d23}, [r4]!
//  50:  ef64 21f4   vorr     q9, q10, q10
//  54:  f942 038d   vst2.32  {d16-d19}, [r2]!
//  58:  d1f5        bne.n    46 <ScaleARGBRowDown2_C+0x46>

void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld4.32     {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.32     {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "vrhadd.u8   q0, q0, q1                    \n"  // rounding half add
      "vrhadd.u8   q1, q2, q3                    \n"  // rounding half add
      "vst2.32     {q0, q1}, [%1]!               \n"
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
      "vld4.8      {d16, d18, d20, d22}, [%1]!   \n"  // load 8 more ARGB
      "vld4.8      {d17, d19, d21, d23}, [%1]!   \n"  // load last 8 ARGB
      "vpadal.u8   q0, q8                        \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q9                        \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8   q2, q10                       \n"  // R 16 bytes -> 8 shorts.
      "vpadal.u8   q3, q11                       \n"  // A 16 bytes -> 8 shorts.
      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
      "vrshrn.u16  d1, q1, #2                    \n"
      "vrshrn.u16  d2, q2, #2                    \n"
      "vrshrn.u16  d3, q3, #2                    \n"
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "mov         r12, %3, lsl #2               \n"
      "1:                                        \n"
      "vld1.32     {d0[0]}, [%0], r12            \n"
      "vld1.32     {d0[1]}, [%0], r12            \n"
      "vld1.32     {d1[0]}, [%0], r12            \n"
      "vld1.32     {d1[1]}, [%0], r12            \n"
      "subs        %2, %2, #4                    \n"  // 4 pixels per loop.
      "vst1.8      {q0}, [%1]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      : "r"(src_stepx)   // %3
      : "memory", "cc", "r12", "q0");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "mov         r12, %4, lsl #2               \n"
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "vld1.8      {d0}, [%0], r12               \n"  // 4 2x2 blocks -> 2x1
      "vld1.8      {d1}, [%1], r12               \n"
      "vld1.8      {d2}, [%0], r12               \n"
      "vld1.8      {d3}, [%1], r12               \n"
      "vld1.8      {d4}, [%0], r12               \n"
      "vld1.8      {d5}, [%1], r12               \n"
      "vld1.8      {d6}, [%0], r12               \n"
      "vld1.8      {d7}, [%1], r12               \n"
      "vaddl.u8    q0, d0, d1                    \n"
      "vaddl.u8    q1, d2, d3                    \n"
      "vaddl.u8    q2, d4, d5                    \n"
      "vaddl.u8    q3, d6, d7                    \n"
      "vswp.8      d1, d2                        \n"  // ab_cd -> ac_bd
      "vswp.8      d5, d6                        \n"  // ef_gh -> eg_fh
      "vadd.u16    q0, q0, q1                    \n"  // (a+b)_(c+d)
      "vadd.u16    q2, q2, q3                    \n"  // (e+f)_(g+h)
      "vrshrn.u16  d0, q0, #2                    \n"  // first 2 pixels.
      "vrshrn.u16  d1, q2, #2                    \n"  // next 2 pixels.
      "subs        %3, %3, #4                    \n"  // 4 pixels per loop.
      "vst1.8      {q0}, [%2]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(src_stride),  // %1
        "+r"(dst_argb),    // %2
        "+r"(dst_width)    // %3
      : "r"(src_stepx)     // %4
      : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"

void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  int tmp;
  const uint8_t* src_tmp = src_argb;
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(d0, 0)
      LOAD1_DATA32_LANE(d0, 1)
      LOAD1_DATA32_LANE(d1, 0)
      LOAD1_DATA32_LANE(d1, 1)
      LOAD1_DATA32_LANE(d2, 0)
      LOAD1_DATA32_LANE(d2, 1)
      LOAD1_DATA32_LANE(d3, 0)
      LOAD1_DATA32_LANE(d3, 1)
      // clang-format on
      "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
      "subs        %2, %2, #8                    \n"  // 8 processed per loop
      "bgt         1b                             \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x),          // %3
        "+r"(dx),         // %4
        "=&r"(tmp),       // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "q0", "q1");
}

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
  "lsr        %5, %3, #16                                \n" \
  "add        %6, %1, %5, lsl #2                         \n" \
  "add        %3, %3, %4                                 \n" \
  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"

void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  asm volatile (
      "vdup.32    q0, %3                         \n"  // x
      "vdup.32    q1, %4                         \n"  // dx
      "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
      "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
      "vmul.s32   q1, q1, q2                     \n"
      "vmov.i8    q3, #0x7f                      \n"  // 0x7F
      "vmov.i16   q15, #0x7f                     \n"  // 0x7F
      // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32   q8, q1, q0                     \n"
      "1:                                        \n"
      // d0, d1: a
      // d2, d3: b
      LOAD2_DATA32_LANE(d0, d2, 0)
      LOAD2_DATA32_LANE(d0, d2, 1)
      LOAD2_DATA32_LANE(d1, d3, 0)
      LOAD2_DATA32_LANE(d1, d3, 1)
      "vshrn.i32  d22, q8, #9                    \n"
      "vand.16    d22, d22, d30                  \n"
      "vdup.8     d24, d22[0]                    \n"
      "vdup.8     d25, d22[2]                    \n"
      "vdup.8     d26, d22[4]                    \n"
      "vdup.8     d27, d22[6]                    \n"
      "vext.8     d4, d24, d25, #4               \n"
      "vext.8     d5, d26, d27, #4               \n"  // f
      "veor.8     q10, q2, q3                    \n"  // 0x7f ^ f
      "vmull.u8   q11, d0, d20                   \n"
      "vmull.u8   q12, d1, d21                   \n"
      "vmull.u8   q13, d2, d4                    \n"
      "vmull.u8   q14, d3, d5                    \n"
      "vadd.i16   q11, q11, q13                  \n"
      "vadd.i16   q12, q12, q14                  \n"
      "vshrn.i16  d0, q11, #7                    \n"
      "vshrn.i16  d1, q12, #7                    \n"

      "vst1.32    {d0, d1}, [%0]!                \n"  // store pixels
      "vadd.s32   q8, q8, q9                     \n"
      "subs       %2, %2, #4                     \n"  // 4 processed per loop
      "bgt        1b                             \n"
    : "+r"(dst_argb),   // %0
      "+r"(src_argb),   // %1
      "+r"(dst_width),  // %2
      "+r"(x),          // %3
      "+r"(dx),         // %4
      "+r"(tmp),        // %5
      "+r"(src_tmp)     // %6
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
      "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "vld2.8      {d0, d2}, [%0]!               \n"  // load 8 UV pixels.
      "vld2.8      {d1, d3}, [%0]!               \n"  // load next 8 UV
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vpaddl.u8   q0, q0                        \n"  // U 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // V 16 bytes -> 8 shorts.
      "vld2.8      {d16, d18}, [%1]!             \n"  // load 8 more UV
      "vld2.8      {d17, d19}, [%1]!             \n"  // load last 8 UV
      "vpadal.u8   q0, q8                        \n"  // U 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q9                        \n"  // V 16 bytes -> 8 shorts.
      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
      "vrshrn.u16  d1, q1, #2                    \n"
      "vst2.8      {d0, d1}, [%2]!               \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "q0", "q1", "q8", "q9");
}

// Reads 4 pixels at a time.
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int src_stepx,  // pixel step
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "vld1.16     {d0[0]}, [%0], %6             \n"
      "vld1.16     {d0[1]}, [%1], %6             \n"
      "vld1.16     {d0[2]}, [%2], %6             \n"
      "vld1.16     {d0[3]}, [%3], %6             \n"
      "subs        %5, %5, #4                    \n"  // 4 pixels per loop.
      "vst1.8      {d0}, [%4]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_ptr),      // %0
        "+r"(src1_ptr),     // %1
        "+r"(src2_ptr),     // %2
        "+r"(src3_ptr),     // %3
        "+r"(dst_ptr),      // %4
        "+r"(dst_width)     // %5
      : "r"(src_stepx * 8)  // %6
      : "memory", "cc", "d0");
}

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif