/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <arm_neon.h>

#include "./av1_rtcd.h"

#include "av1/common/cfl.h"

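// Load eight int16 values from buf, subtract sub from them, and store the
// result back to buf.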
static INLINE void vldsubstq_s16(int16_t *buf, int16x8_t sub) {
  vst1q_s16(buf, vsubq_s16(vld1q_s16(buf), sub));
}

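// Load two vectors of eight uint16 values, offset elements apart, and return
// their sum.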
static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
  return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset));
}

// Load half of a vector (32 bits) and duplicate it in the other half.
static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
  return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr));
}

// Store half of a vector (the low 32 bits).
static INLINE void vsth_s16(int16_t *ptr, int16x4_t val) {
  *((uint32_t *)ptr) = vreinterpret_u32_s16(val)[0];
}

// Store half of a vector (the low 32 bits).
static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
  *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
}

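// 4:2:0 subsampling (8-bit input): each output is the sum of a 2x2 block of
// luma pixels, shifted left by 1 so pred_buf_q3 holds the 2x2 average in Q3.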
static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
      const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
    } else if (width == 8) {
      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
      const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
    } else {
      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
      const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(sum), 1));
      if (width == 32) {
        const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
        const uint16x8_t next_sum =
            vpadalq_u8(next_top, vld1q_u8(input + 16 + input_stride));
        vst1q_s16(pred_buf_q3 + 8,
                  vshlq_n_s16(vreinterpretq_s16_u16(next_sum), 1));
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

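// 4:2:2 subsampling (8-bit input): each output is the sum of a horizontal
// pair of luma pixels, shifted left by 2 so pred_buf_q3 holds the pair
// average in Q3.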
static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  do {
    if (width == 4) {
      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
    } else if (width == 8) {
      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
    } else {
      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(top), 2));
      if (width == 32) {
        const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
        vst1q_s16(pred_buf_q3 + 8,
                  vshlq_n_s16(vreinterpretq_s16_u16(next_top), 2));
      }
    }
    input += input_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

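// 4:4:4 subsampling (8-bit input): no subsampling, each luma pixel is simply
// shifted left by 3 to convert it to Q3.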
static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  do {
    if (width == 4) {
      const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3);
      vst1_s16(pred_buf_q3, vreinterpret_s16_u16(vget_low_u16(top)));
    } else if (width == 8) {
      const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3);
      vst1q_s16(pred_buf_q3, vreinterpretq_s16_u16(top));
    } else {
      const uint8x16_t top = vld1q_u8(input);
      vst1q_s16(pred_buf_q3,
                vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(top), 3)));
      vst1q_s16(pred_buf_q3 + 8,
                vreinterpretq_s16_u16(vshll_n_u8(vget_high_u8(top), 3)));
      if (width == 32) {
        const uint8x16_t next_top = vld1q_u8(input + 16);
        vst1q_s16(pred_buf_q3 + 16,
                  vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(next_top), 3)));
        vst1q_s16(pred_buf_q3 + 24,
                  vreinterpretq_s16_u16(vshll_n_u8(vget_high_u8(next_top), 3)));
      }
    }
    input += input_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

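// vpaddq_u16 is an A64 intrinsic; on ARMv7 and earlier targets, provide an
// equivalent pairwise add built from vpadd_u16.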
#if __ARM_ARCH <= 7
uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
}
#endif

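// 4:2:0 subsampling (high bit depth input): same idea as the low bit depth
// version, but the luma pixels are already 16-bit, so horizontal pairs are
// summed with vpaddq_u16 (or via vld4q_u16 de-interleaving for width 32)
// instead of vpaddl_u8.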
static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
                                              int input_stride,
                                              int16_t *pred_buf_q3, int width,
                                              int height) {
  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const uint16x4_t top = vld1_u16(input);
      const uint16x4_t bot = vld1_u16(input + input_stride);
      const uint16x4_t sum = vadd_u16(top, bot);
      const uint16x4_t hsum = vpadd_u16(sum, sum);
      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(hsum), 1));
    } else if (width < 32) {
      const uint16x8_t top = vld1q_u16(input);
      const uint16x8_t bot = vld1q_u16(input + input_stride);
      const uint16x8_t sum = vaddq_u16(top, bot);
      if (width == 8) {
        const int16x4_t hsum =
            vreinterpret_s16_u16(vget_low_u16(vpaddq_u16(sum, sum)));
        vst1_s16(pred_buf_q3, vshl_n_s16(hsum, 1));
      } else {
        const uint16x8_t top_1 = vld1q_u16(input + 8);
        const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride);
        const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1);
        const int16x8_t hsum = vreinterpretq_s16_u16(vpaddq_u16(sum, sum_1));
        vst1q_s16(pred_buf_q3, vshlq_n_s16(hsum, 1));
      }
    } else {
      const uint16x8x4_t top = vld4q_u16(input);
      const uint16x8x4_t bot = vld4q_u16(input + input_stride);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]);
      // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
      const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]);
      int16x8x2_t sum;
      sum.val[0] =
          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_0, bot_0)), 1);
      sum.val[1] =
          vshlq_n_s16(vreinterpretq_s16_u16(vaddq_u16(top_1, bot_1)), 1);
      vst2q_s16(pred_buf_q3, sum);
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

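// CFL_GET_SUBSAMPLE_FUNCTION (defined in av1/common/cfl.h) generates the
// per-block-size wrappers and dispatch helpers around the subsampling
// functions above.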
CFL_GET_SUBSAMPLE_FUNCTION(neon)

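// Subtract the block average from every Q3 value in the prediction buffer,
// leaving the zero-mean "AC" contribution used by CfL.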
static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
                                         int height, int round_offset,
                                         const int num_pel_log2) {
  const int16_t *const end = pred_buf + height * CFL_BUF_LINE;
  const uint16_t *const sum_end = (uint16_t *)end;

  // Round offset is not needed, because NEON will handle the rounding.
  (void)round_offset;

  // To optimize the use of the CPU pipeline, we process 4 rows per iteration.
  const int step = 4 * CFL_BUF_LINE;

  // At this stage, the prediction buffer contains scaled reconstructed luma
  // pixels, which are positive integers and only require 15 bits. By using
  // unsigned integers for the sum, we can do one addition operation inside 16
  // bits (8 lanes) before having to convert to 32 bits (4 lanes).
  const uint16_t *sum_buf = (uint16_t *)pred_buf;
  uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
  do {
    // For all widths, we load, add and combine the data so it fits in 4 lanes.
    if (width == 4) {
      const uint16x4_t a0 =
          vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE));
      const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE),
                                     vld1_u16(sum_buf + 3 * CFL_BUF_LINE));
      sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1));
    } else if (width == 8) {
      const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE);
      const uint16x8_t a1 =
          vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE);
      sum_32x4 = vpadalq_u16(sum_32x4, a0);
      sum_32x4 = vpadalq_u16(sum_32x4, a1);
    } else {
      const uint16x8_t row0 = vldaddq_u16(sum_buf, 8);
      const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8);
      const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8);
      const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8);
      sum_32x4 = vpadalq_u16(sum_32x4, row0);
      sum_32x4 = vpadalq_u16(sum_32x4, row1);
      sum_32x4 = vpadalq_u16(sum_32x4, row2);
      sum_32x4 = vpadalq_u16(sum_32x4, row3);

      if (width == 32) {
        const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8);
        const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8);
        const uint16x8_t row2_1 =
            vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8);
        const uint16x8_t row3_1 =
            vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8);

        sum_32x4 = vpadalq_u16(sum_32x4, row0_1);
        sum_32x4 = vpadalq_u16(sum_32x4, row1_1);
        sum_32x4 = vpadalq_u16(sum_32x4, row2_1);
        sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
      }
    }
  } while ((sum_buf += step) < sum_end);

  // Permute and add in such a way that each lane contains the block sum.
  // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
#if __ARM_ARCH >= 8
  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
#else
  uint32x4_t flip =
      vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4));
  sum_32x4 = vaddq_u32(sum_32x4, flip);
  sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4));
#endif

  // Computing the average could be done using scalars, but getting off the
  // NEON engine introduces latency, so we use vqrshrn.
  int16x4_t avg_16x4;
  // Constant propagation makes for some ugly code.
  switch (num_pel_log2) {
    case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break;
    case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break;
    case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break;
    case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break;
    case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break;
    case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break;
    case 10:
      avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10));
      break;
    default: assert(0);
  }

  if (width == 4) {
    do {
      vst1_s16(pred_buf, vsub_s16(vld1_s16(pred_buf), avg_16x4));
    } while ((pred_buf += CFL_BUF_LINE) < end);
  } else {
    const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
    do {
      vldsubstq_s16(pred_buf, avg_16x8);
      vldsubstq_s16(pred_buf + CFL_BUF_LINE, avg_16x8);
      vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE, avg_16x8);
      vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE, avg_16x8);

      if (width > 8) {
        vldsubstq_s16(pred_buf + 8, avg_16x8);
        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 8, avg_16x8);
        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 8, avg_16x8);
        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 8, avg_16x8);
      }
      if (width == 32) {
        vldsubstq_s16(pred_buf + 16, avg_16x8);
        vldsubstq_s16(pred_buf + 24, avg_16x8);
        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 16, avg_16x8);
        vldsubstq_s16(pred_buf + CFL_BUF_LINE + 24, avg_16x8);
        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 16, avg_16x8);
        vldsubstq_s16(pred_buf + 2 * CFL_BUF_LINE + 24, avg_16x8);
        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 16, avg_16x8);
        vldsubstq_s16(pred_buf + 3 * CFL_BUF_LINE + 24, avg_16x8);
      }
    } while ((pred_buf += step) < end);
  }
}

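// CFL_SUB_AVG_FN (defined in av1/common/cfl.h) generates the per-block-size
// subtract-average wrappers that pass compile-time width, height and rounding
// constants to subtract_average_neon.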
CFL_SUB_AVG_FN(neon)

// Saturating negate 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative.
// Notes:
//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
//     practice, as scaled_luma is the multiplication of two absolute values.
//   * In the Intel equivalent, elements in a are zeroed out when the
//     corresponding elements in b are zero. Because vsign is used twice in a
//     row, with b in the first call becoming a in the second call, there's no
//     impact from not zeroing out.
static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) {
  const int16x4_t mask = vshr_n_s16(b, 15);
  return veor_s16(vadd_s16(a, mask), mask);
}

// Saturating negate 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative.
// Notes:
//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
//     practice, as scaled_luma is the multiplication of two absolute values.
//   * In the Intel equivalent, elements in a are zeroed out when the
//     corresponding elements in b are zero. Because vsignq is used twice in a
//     row, with b in the first call becoming a in the second call, there's no
//     impact from not zeroing out.
static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
  const int16x8_t mask = vshrq_n_s16(b, 15);
  return veorq_s16(vaddq_s16(a, mask), mask);
}

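// Compute the CfL prediction term alpha * luma_ac + dc for one vector of Q3
// AC values: vqrdmulh computes (2 * a * b + (1 << 15)) >> 16, so multiplying
// the Q3 AC magnitude by |alpha| in Q12 yields the rounded integer product
// alpha * luma_ac; the sign is restored from the XOR of the alpha and AC sign
// bits before the DC value is added.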
static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
                                   int16x4_t alpha_sign, int abs_alpha_q12,
                                   int16x4_t dc) {
  const int16x4_t ac_q3 = vld1_s16(pred_buf_q3);
  const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3);
  int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12);
  return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc);
}

static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
                                   int16x8_t alpha_sign, int abs_alpha_q12,
                                   int16x8_t dc) {
  const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3);
  const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3);
  int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12);
  return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc);
}

// Vector signed->unsigned narrowing half store
static void vsthun_s16(uint8_t *dst, int16x4_t scaled_luma) {
  vsth_u8(dst, vqmovun_s16(vcombine_s16(scaled_luma, scaled_luma)));
}

// Vector signed->unsigned narrowing store
static void vst1un_s16(uint8_t *dst, int16x8_t scaled_luma) {
  vst1_u8(dst, vqmovun_s16(scaled_luma));
}

// Vector signed->unsigned narrowing store
static void vst1unq_s16(uint8_t *dst, int16x8_t scaled_luma,
                        int16x8_t scaled_luma_next) {
  vst1q_u8(dst, vcombine_u8(vqmovun_s16(scaled_luma),
                            vqmovun_s16(scaled_luma_next)));
}

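// CfL prediction for 8-bit output: dst = saturate(alpha * luma_ac + DC). The
// DC prediction is read from the first destination pixel, which has already
// been filled by the DC predictor.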
static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
                                        uint8_t *dst, int dst_stride,
                                        int alpha_q3, int width, int height) {
  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
  if (width == 4) {
    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
    const int16x4_t dc = vdup_n_s16(*dst);
    do {
      const int16x4_t scaled_luma =
          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      vsthun_s16(dst, scaled_luma);
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  } else {
    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
    const int16x8_t dc = vdupq_n_s16(*dst);
    do {
      const int16x8_t scaled_luma =
          predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      if (width == 8) {
        vst1un_s16(dst, scaled_luma);
      } else {
        const int16x8_t scaled_luma_1 =
            predict_w8(pred_buf_q3 + 8, alpha_sign, abs_alpha_q12, dc);
        vst1unq_s16(dst, scaled_luma, scaled_luma_1);
        if (width == 32) {
          const int16x8_t scaled_luma_2 =
              predict_w8(pred_buf_q3 + 16, alpha_sign, abs_alpha_q12, dc);
          const int16x8_t scaled_luma_3 =
              predict_w8(pred_buf_q3 + 24, alpha_sign, abs_alpha_q12, dc);
          vst1unq_s16(dst + 16, scaled_luma_2, scaled_luma_3);
        }
      }
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  }
}

CFL_PREDICT_FN(neon, lbd)

static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
  return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
}

static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
  return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0)));
}

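// CfL prediction for high bit depth output: same as the low bit depth path,
// but the result is clamped to [0, (1 << bd) - 1] instead of relying on the
// saturating narrow to 8 bits.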
static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
                                        uint16_t *dst, int dst_stride,
                                        int alpha_q3, int bd, int width,
                                        int height) {
  const int max = (1 << bd) - 1;
  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
  if (width == 4) {
    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
    const int16x4_t dc = vdup_n_s16(*dst);
    const int16x4_t max_16x4 = vdup_n_s16(max);
    do {
      const int16x4_t scaled_luma =
          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  } else {
    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
    const int16x8_t dc = vdupq_n_s16(*dst);
    const int16x8_t max_16x8 = vdupq_n_s16(max);
    do {
      const int16x8_t scaled_luma =
          predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
      vst1q_u16(dst, clampq_s16(scaled_luma, max_16x8));
      if (width >= 16) {
        const int16x8_t scaled_luma_1 =
            predict_w8(pred_buf_q3 + 8, alpha_sign, abs_alpha_q12, dc);
        vst1q_u16(dst + 8, clampq_s16(scaled_luma_1, max_16x8));
        if (width == 32) {
          const int16x8_t scaled_luma_2 =
              predict_w8(pred_buf_q3 + 16, alpha_sign, abs_alpha_q12, dc);
          vst1q_u16(dst + 16, clampq_s16(scaled_luma_2, max_16x8));
          const int16x8_t scaled_luma_3 =
              predict_w8(pred_buf_q3 + 24, alpha_sign, abs_alpha_q12, dc);
          vst1q_u16(dst + 24, clampq_s16(scaled_luma_3, max_16x8));
        }
      }
      dst += dst_stride;
    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
  }
}

CFL_PREDICT_FN(neon, hbd)