/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <arm_neon.h>

#include "./av1_rtcd.h"

#include "av1/common/cfl.h"

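// Load eight int16_t values from |buf|, subtract |sub| and store the result
// back to |buf| (fused load-subtract-store).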
static INLINE void vldsubstq_s16(int16_t *buf, int16x8_t sub) {
  vst1q_s16(buf, vsubq_s16(vld1q_s16(buf), sub));
}

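// Load eight uint16_t values from |buf| and from |buf + offset|, and return
// their element-wise sum.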
static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
  return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset));
}

// Load half of a vector and duplicate it in the other half.
static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
  return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr));
}

// Store half of a vector.
static INLINE void vsth_s16(int16_t *ptr, int16x4_t val) {
  *((uint32_t *)ptr) = vreinterpret_u32_s16(val)[0];
}

// Store half of a vector.
static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
  *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
}

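// 4:2:0 subsampling: each output is the sum of a 2x2 luma block shifted left
// by 1, i.e. eight times (Q3) the average of the co-located luma pixels.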
static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
      const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
    } else if (width == 8) {
      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
      const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
    } else {
      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
      const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(sum), 1));
      if (width == 32) {
        const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
        const uint16x8_t next_sum =
            vpadalq_u8(next_top, vld1q_u8(input + 16 + input_stride));
        vst1q_s16(pred_buf_q3 + 8,
                  vshlq_n_s16(vreinterpretq_s16_u16(next_sum), 1));
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

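// 4:2:2 subsampling: each output is the sum of two horizontally adjacent luma
// pixels shifted left by 2, again eight times (Q3) their average.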
static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  do {
    if (width == 4) {
      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
    } else if (width == 8) {
      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
    } else {
      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(top), 2));
      if (width == 32) {
        const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
        vst1q_s16(pred_buf_q3 + 8,
                  vshlq_n_s16(vreinterpretq_s16_u16(next_top), 2));
      }
    }
    input += input_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

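// 4:4:4: no subsampling, each luma pixel is simply shifted left by 3 (Q3).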
static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  do {
    if (width == 4) {
      const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3);
      vst1_s16(pred_buf_q3, vreinterpret_s16_u16(vget_low_u16(top)));
    } else if (width == 8) {
      const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3);
      vst1q_s16(pred_buf_q3, vreinterpretq_s16_u16(top));
    } else {
      const uint8x16_t top = vld1q_u8(input);
      vst1q_s16(pred_buf_q3,
                vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(top), 3)));
      vst1q_s16(pred_buf_q3 + 8,
                vreinterpretq_s16_u16(vshll_n_u8(vget_high_u8(top), 3)));
      if (width == 32) {
        const uint8x16_t next_top = vld1q_u8(input + 16);
        vst1q_s16(pred_buf_q3 + 16,
                  vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(next_top), 3)));
        vst1q_s16(pred_buf_q3 + 24,
                  vreinterpretq_s16_u16(vshll_n_u8(vget_high_u8(next_top), 3)));
      }
    }
    input += input_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

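// vpaddq_u16 is an AArch64-only intrinsic; emulate it with two 64-bit pairwise
// adds when targeting ARMv7 and below.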
#if __ARM_ARCH <= 7
uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
}
#endif

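// High bit depth 4:2:0 variant. For width >= 32, vld4q_u16 is used so that the
// two pixels of each horizontal pair land in the same lane of adjacent
// registers, letting a plain vaddq_u16 act as the pairwise add; vst2q_s16 then
// restores the output order.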
static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const uint16x4_t top = vld1_u16(input);
      const uint16x4_t bot = vld1_u16(input + input_stride);
      const uint16x4_t sum = vadd_u16(top, bot);
      const uint16x4_t hsum = vpadd_u16(sum, sum);
      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(hsum), 1));
    } else if (width < 32) {
      const uint16x8_t top = vld1q_u16(input);
      const uint16x8_t bot = vld1q_u16(input + input_stride);
      const uint16x8_t sum = vaddq_u16(top, bot);
      if (width == 8) {
        const int16x4_t hsum =
            vreinterpret_s16_u16(vget_low_u16(vpaddq_u16(sum, sum)));
        vst1_s16(pred_buf_q3, vshl_n_s16(hsum, 1));
      } else {
        const uint16x8_t top_1 = vld1q_u16(input + 8);
        const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride);
        const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1);
        const int16x8_t hsum = vreinterpretq_s16_u16(vpaddq_u16(sum, sum_1));
        vst1q_s16(pred_buf_q3, vshlq_n_s16(hsum, 1));
      }
    } else {
      const uint16x8x4_t top = vld4q_u16(input);
      const uint16x8x4_t bot = vld4q_u16(input + input_stride);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]);
      int16x8x2_t sum;
      sum.val[0] =
          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_0, bot_0)), 1);
      sum.val[1] =
          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_1, bot_1)), 1);
      vst2q_s16(pred_buf_q3, sum);
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

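// CFL_GET_SUBSAMPLE_FUNCTION (defined in av1/common/cfl.h) is expected to wrap
// the subsamplers above into the block-size-specialized entry points used by
// the RTCD dispatch.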
CFL_GET_SUBSAMPLE_FUNCTION(neon)

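// Subtract the block average from every sample of the prediction buffer so
// that only the zero-mean "AC" part of the scaled luma remains.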
static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
                                         int height, int round_offset,
                                         const int num_pel_log2) {
  const int16_t *const end = pred_buf + height * CFL_BUF_LINE;
  const uint16_t *const sum_end = (uint16_t *)end;

  // Round offset is not needed, because NEON will handle the rounding.
  (void)round_offset;

  // To optimize the use of the CPU pipeline, we process 4 rows per iteration
  const int step = 4 * CFL_BUF_LINE;

  // At this stage, the prediction buffer contains scaled reconstructed luma
  // pixels, which are positive integers and only require 15 bits. By using
  // unsigned integers for the sum, we can do one addition operation inside 16
  // bits (8 lanes) before having to convert to 32 bits (4 lanes).
  const uint16_t *sum_buf = (uint16_t *)pred_buf;
  uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
  do {
    // For all widths, we load, add and combine the data so it fits in 4 lanes.
    if (width == 4) {
      const uint16x4_t a0 =
          vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE));
      const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE),
                                     vld1_u16(sum_buf + 3 * CFL_BUF_LINE));
      sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1));
    } else if (width == 8) {
      const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE);
      const uint16x8_t a1 =
          vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE);
      sum_32x4 = vpadalq_u16(sum_32x4, a0);
      sum_32x4 = vpadalq_u16(sum_32x4, a1);
    } else {
      const uint16x8_t row0 = vldaddq_u16(sum_buf, 8);
      const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8);
      const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8);
      const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8);
      sum_32x4 = vpadalq_u16(sum_32x4, row0);
      sum_32x4 = vpadalq_u16(sum_32x4, row1);
      sum_32x4 = vpadalq_u16(sum_32x4, row2);
      sum_32x4 = vpadalq_u16(sum_32x4, row3);

      if (width == 32) {
        const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8);
        const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8);
        const uint16x8_t row2_1 =
            vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8);
        const uint16x8_t row3_1 =
            vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8);

        sum_32x4 = vpadalq_u16(sum_32x4, row0_1);
        sum_32x4 = vpadalq_u16(sum_32x4, row1_1);
        sum_32x4 = vpadalq_u16(sum_32x4, row2_1);
        sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
      }
    }
  } while ((sum_buf += step) < sum_end);

  // Permute and add in such a way that each lane contains the block sum.
  // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
#if __ARM_ARCH >= 8
  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
#else
  uint32x4_t flip =
      vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4));
  sum_32x4 = vaddq_u32(sum_32x4, flip);
  sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4));
#endif

  // Computing the average could be done using scalars, but getting off the NEON
  // engine introduces latency, so we use vqrshrn.
  int16x4_t avg_16x4;
  // Constant propagation makes for some ugly code.
  switch (num_pel_log2) {
    case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break;
    case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break;
    case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break;
    case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break;
    case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break;
    case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break;
    case 10:
      avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10));
      break;
    default: assert(0);
  }

  if (width == 4) {
    do {
      vst1_s16(pred_buf, vsub_s16(vld1_s16(pred_buf), avg_16x4));
    } while ((pred_buf += CFL_BUF_LINE) < end);
  } else {
    const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
    do {
      vldsubstq_s16(pred_buf, avg_16x8);
      vldsubstq_s16(pred_buf + CFL_BUF_LINE, avg_16x8);
      vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE, avg_16x8);
      vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE, avg_16x8);

      if (width > 8) {
        vldsubstq_s16(pred_buf + 8, avg_16x8);
        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 8, avg_16x8);
        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 8, avg_16x8);
        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 8, avg_16x8);
      }
      if (width == 32) {
        vldsubstq_s16(pred_buf + 16, avg_16x8);
        vldsubstq_s16(pred_buf + 24, avg_16x8);
        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 16, avg_16x8);
        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 24, avg_16x8);
        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 16, avg_16x8);
        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 24, avg_16x8);
        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 16, avg_16x8);
        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 24, avg_16x8);
      }
    } while ((pred_buf += step) < end);
  }
}

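// CFL_SUB_AVG_FN (see cfl.h) presumably instantiates subtract_average_neon
// with compile-time constant width, height and num_pel_log2 for each block
// size, which is what allows the switch above to be constant-propagated.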
CFL_SUB_AVG_FN(neon)

// Negate 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative.
// Notes:
//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
//     practice, as scaled_luma is the multiplication of two absolute values.
//   * In the Intel equivalent, elements in a are zeroed out when the
//     corresponding elements in b are zero. Because vsign is used twice in a
//     row, with b in the first call becoming a in the second call, there's no
//     impact from not zeroing out.
static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) {
  const int16x4_t mask = vshr_n_s16(b, 15);
  return veor_s16(vadd_s16(a, mask), mask);
}

// Negate 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative.
// Notes:
//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
//     practice, as scaled_luma is the multiplication of two absolute values.
//   * In the Intel equivalent, elements in a are zeroed out when the
//     corresponding elements in b are zero. Because vsignq is used twice in a
//     row, with b in the first call becoming a in the second call, there's no
//     impact from not zeroing out.
static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
  const int16x8_t mask = vshrq_n_s16(b, 15);
  return veorq_s16(vaddq_s16(a, mask), mask);
}

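// Compute dc + ((alpha_q3 * ac_q3) >> 6), rounded, for four (predict_w4) or
// eight (predict_w8) pixels: abs_alpha_q12 is |alpha_q3| << 9, vqrdmulh
// applies a rounding >> 15 for a net shift of 6, and vsign_s16/vsignq_s16
// restores the sign of the product.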
static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
                                   int16x4_t alpha_sign, int abs_alpha_q12,
                                   int16x4_t dc) {
  const int16x4_t ac_q3 = vld1_s16(pred_buf_q3);
  const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3);
  int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12);
  return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc);
}

static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
                                   int16x8_t alpha_sign, int abs_alpha_q12,
                                   int16x8_t dc) {
  const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3);
  const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3);
  int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12);
  return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc);
}

// Vector signed->unsigned narrowing half store
static void vsthun_s16(uint8_t *dst, int16x4_t scaled_luma) {
  vsth_u8(dst, vqmovun_s16(vcombine_s16(scaled_luma, scaled_luma)));
}

// Vector signed->unsigned narrowing store
static void vst1un_s16(uint8_t *dst, int16x8_t scaled_luma) {
  vst1_u8(dst, vqmovun_s16(scaled_luma));
}

// Vector signed->unsigned narrowing store
static void vst1unq_s16(uint8_t *dst, int16x8_t scaled_luma,
                        int16x8_t scaled_luma_next) {
  vst1q_u8(dst, vcombine_u8(vqmovun_s16(scaled_luma),
                            vqmovun_s16(scaled_luma_next)));
}

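// The DC prediction is assumed to have already been written to dst (every
// sample equal), so the base value added to each pixel is read from *dst.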
static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
                                        uint8_t *dst, int dst_stride,
                                        int alpha_q3, int width, int height) {
  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
  if (width == 4) {
    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
    const int16x4_t dc = vdup_n_s16(*dst);
    do {
      const int16x4_t scaled_luma =
          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      vsthun_s16(dst, scaled_luma);
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  } else {
    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
    const int16x8_t dc = vdupq_n_s16(*dst);
    do {
      const int16x8_t scaled_luma =
          predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      if (width == 8) {
        vst1un_s16(dst, scaled_luma);
      } else {
        const int16x8_t scaled_luma_1 =
            predict_w8(pred_buf_q3 + 8, alpha_sign, abs_alpha_q12, dc);
        vst1unq_s16(dst, scaled_luma, scaled_luma_1);
        if (width == 32) {
          const int16x8_t scaled_luma_2 =
              predict_w8(pred_buf_q3 + 16, alpha_sign, abs_alpha_q12, dc);
          const int16x8_t scaled_luma_3 =
              predict_w8(pred_buf_q3 + 24, alpha_sign, abs_alpha_q12, dc);
          vst1unq_s16(dst + 16, scaled_luma_2, scaled_luma_3);
        }
      }
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  }
}

CFL_PREDICT_FN(neon, lbd)

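// Clamp a to [0, max] and reinterpret the result as unsigned for the high bit
// depth store.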
static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
  return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
}

static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
  return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0)));
}

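// High bit depth prediction: same scaling as the low bit depth path, but the
// result is clamped to [0, (1 << bd) - 1] instead of being narrowed to 8 bits.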
static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
                                        uint16_t *dst, int dst_stride,
                                        int alpha_q3, int bd, int width,
                                        int height) {
  const int max = (1 << bd) - 1;
  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
  if (width == 4) {
    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
    const int16x4_t dc = vdup_n_s16(*dst);
    const int16x4_t max_16x4 = vdup_n_s16(max);
    do {
      const int16x4_t scaled_luma =
          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  } else {
    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
    const int16x8_t dc = vdupq_n_s16(*dst);
    const int16x8_t max_16x8 = vdupq_n_s16(max);
    do {
      const int16x8_t scaled_luma =
          predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      vst1q_u16(dst, clampq_s16(scaled_luma, max_16x8));
      if (width >= 16) {
        const int16x8_t scaled_luma_1 =
            predict_w8(pred_buf_q3 + 8, alpha_sign, abs_alpha_q12, dc);
        vst1q_u16(dst + 8, clampq_s16(scaled_luma_1, max_16x8));
        if (width == 32) {
          const int16x8_t scaled_luma_2 =
              predict_w8(pred_buf_q3 + 16, alpha_sign, abs_alpha_q12, dc);
          vst1q_u16(dst + 16, clampq_s16(scaled_luma_2, max_16x8));
          const int16x8_t scaled_luma_3 =
              predict_w8(pred_buf_q3 + 24, alpha_sign, abs_alpha_q12, dc);
          vst1q_u16(dst + 24, clampq_s16(scaled_luma_3, max_16x8));
        }
      }
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  }
}

CFL_PREDICT_FN(neon, hbd)