/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#include <arm_neon.h>

#include "common/tools_common.h"

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"

22//------------------------------------------------------------------------------
23// DC 4x4
24
25// 'do_above' and 'do_left' facilitate branch removal when inlined.
26static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
27 const uint8_t *left, int do_above, int do_left) {
28 uint16x8_t sum_top;
29 uint16x8_t sum_left;
30 uint8x8_t dc0;
31
32 if (do_above) {
33 const uint8x8_t A = vld1_u8(above); // top row
34 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
35 const uint16x4_t p1 = vpadd_u16(p0, p0);
36 sum_top = vcombine_u16(p1, p1);
37 }
38
39 if (do_left) {
40 const uint8x8_t L = vld1_u8(left); // left border
41 const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
42 const uint16x4_t p1 = vpadd_u16(p0, p0);
43 sum_left = vcombine_u16(p1, p1);
44 }
45
46 if (do_above && do_left) {
47 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
48 dc0 = vrshrn_n_u16(sum, 3);
49 } else if (do_above) {
50 dc0 = vrshrn_n_u16(sum_top, 2);
51 } else if (do_left) {
52 dc0 = vrshrn_n_u16(sum_left, 2);
53 } else {
54 dc0 = vdup_n_u8(0x80);
55 }
56
57 {
58 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
59 int i;
60 for (i = 0; i < 4; ++i) {
61 vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
62 }
63 }
64}
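
// For reference, dc_4x4() above mirrors this scalar computation (a simplified
// sketch, not part of the build): the rounding shift is 3 when both borders
// contribute (8 samples) and 2 when only one border is used (4 samples).
//
//   int sum = 0;
//   if (do_above) for (int i = 0; i < 4; ++i) sum += above[i];
//   if (do_left) for (int i = 0; i < 4; ++i) sum += left[i];
//   const int count = 4 * (do_above + do_left);
//   const uint8_t dc = count ? (uint8_t)((sum + (count >> 1)) / count) : 0x80;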
65
void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
68 dc_4x4(dst, stride, above, left, 1, 1);
69}
70
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
73 (void)above;
74 dc_4x4(dst, stride, NULL, left, 0, 1);
75}
76
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
79 (void)left;
80 dc_4x4(dst, stride, above, NULL, 1, 0);
81}
82
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
85 (void)above;
86 (void)left;
87 dc_4x4(dst, stride, NULL, NULL, 0, 0);
88}
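
// Example usage (a sketch; `pred`, `above_row` and `left_col` are
// hypothetical caller-side buffers holding the block and its reconstructed
// border samples):
//   uint8_t pred[4 * 4];
//   aom_dc_predictor_4x4_neon(pred, /*stride=*/4, above_row, left_col);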
89
90//------------------------------------------------------------------------------
91// DC 8x8
92
93// 'do_above' and 'do_left' facilitate branch removal when inlined.
94static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
95 const uint8_t *left, int do_above, int do_left) {
96 uint16x8_t sum_top;
97 uint16x8_t sum_left;
98 uint8x8_t dc0;
99
100 if (do_above) {
101 const uint8x8_t A = vld1_u8(above); // top row
102 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
103 const uint16x4_t p1 = vpadd_u16(p0, p0);
104 const uint16x4_t p2 = vpadd_u16(p1, p1);
105 sum_top = vcombine_u16(p2, p2);
106 }
107
108 if (do_left) {
109 const uint8x8_t L = vld1_u8(left); // left border
110 const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
111 const uint16x4_t p1 = vpadd_u16(p0, p0);
112 const uint16x4_t p2 = vpadd_u16(p1, p1);
113 sum_left = vcombine_u16(p2, p2);
114 }
115
116 if (do_above && do_left) {
117 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
118 dc0 = vrshrn_n_u16(sum, 4);
119 } else if (do_above) {
120 dc0 = vrshrn_n_u16(sum_top, 3);
121 } else if (do_left) {
122 dc0 = vrshrn_n_u16(sum_left, 3);
123 } else {
124 dc0 = vdup_n_u8(0x80);
125 }
126
127 {
128 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
129 int i;
130 for (i = 0; i < 8; ++i) {
131 vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
132 }
133 }
134}
135
void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
138 dc_8x8(dst, stride, above, left, 1, 1);
139}
140
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
143 (void)above;
144 dc_8x8(dst, stride, NULL, left, 0, 1);
145}
146
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
149 (void)left;
150 dc_8x8(dst, stride, above, NULL, 1, 0);
151}
152
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
155 (void)above;
156 (void)left;
157 dc_8x8(dst, stride, NULL, NULL, 0, 0);
158}
159
160//------------------------------------------------------------------------------
161// DC 16x16
162
163// 'do_above' and 'do_left' facilitate branch removal when inlined.
164static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
165 const uint8_t *above, const uint8_t *left,
166 int do_above, int do_left) {
167 uint16x8_t sum_top;
168 uint16x8_t sum_left;
169 uint8x8_t dc0;
170
171 if (do_above) {
172 const uint8x16_t A = vld1q_u8(above); // top row
173 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
174 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
175 const uint16x4_t p2 = vpadd_u16(p1, p1);
176 const uint16x4_t p3 = vpadd_u16(p2, p2);
177 sum_top = vcombine_u16(p3, p3);
178 }
179
180 if (do_left) {
181 const uint8x16_t L = vld1q_u8(left); // left row
182 const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
183 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
184 const uint16x4_t p2 = vpadd_u16(p1, p1);
185 const uint16x4_t p3 = vpadd_u16(p2, p2);
186 sum_left = vcombine_u16(p3, p3);
187 }
188
189 if (do_above && do_left) {
190 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
191 dc0 = vrshrn_n_u16(sum, 5);
192 } else if (do_above) {
193 dc0 = vrshrn_n_u16(sum_top, 4);
194 } else if (do_left) {
195 dc0 = vrshrn_n_u16(sum_left, 4);
196 } else {
197 dc0 = vdup_n_u8(0x80);
198 }
199
200 {
201 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
202 int i;
203 for (i = 0; i < 16; ++i) {
204 vst1q_u8(dst + i * stride, dc);
205 }
206 }
207}
208
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
211 dc_16x16(dst, stride, above, left, 1, 1);
212}
213
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
216 const uint8_t *left) {
217 (void)above;
218 dc_16x16(dst, stride, NULL, left, 0, 1);
219}
220
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
223 const uint8_t *left) {
224 (void)left;
225 dc_16x16(dst, stride, above, NULL, 1, 0);
226}
227
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
230 const uint8_t *left) {
231 (void)above;
232 (void)left;
233 dc_16x16(dst, stride, NULL, NULL, 0, 0);
234}
235
236//------------------------------------------------------------------------------
237// DC 32x32
238
239// 'do_above' and 'do_left' facilitate branch removal when inlined.
240static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
241 const uint8_t *above, const uint8_t *left,
242 int do_above, int do_left) {
243 uint16x8_t sum_top;
244 uint16x8_t sum_left;
245 uint8x8_t dc0;
246
247 if (do_above) {
248 const uint8x16_t A0 = vld1q_u8(above); // top row
249 const uint8x16_t A1 = vld1q_u8(above + 16);
250 const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
251 const uint16x8_t p1 = vpaddlq_u8(A1);
252 const uint16x8_t p2 = vaddq_u16(p0, p1);
253 const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
254 const uint16x4_t p4 = vpadd_u16(p3, p3);
255 const uint16x4_t p5 = vpadd_u16(p4, p4);
256 sum_top = vcombine_u16(p5, p5);
257 }
258
259 if (do_left) {
260 const uint8x16_t L0 = vld1q_u8(left); // left row
261 const uint8x16_t L1 = vld1q_u8(left + 16);
262 const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
263 const uint16x8_t p1 = vpaddlq_u8(L1);
264 const uint16x8_t p2 = vaddq_u16(p0, p1);
265 const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
266 const uint16x4_t p4 = vpadd_u16(p3, p3);
267 const uint16x4_t p5 = vpadd_u16(p4, p4);
268 sum_left = vcombine_u16(p5, p5);
269 }
270
271 if (do_above && do_left) {
272 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
273 dc0 = vrshrn_n_u16(sum, 6);
274 } else if (do_above) {
275 dc0 = vrshrn_n_u16(sum_top, 5);
276 } else if (do_left) {
277 dc0 = vrshrn_n_u16(sum_left, 5);
278 } else {
279 dc0 = vdup_n_u8(0x80);
280 }
281
282 {
283 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
284 int i;
285 for (i = 0; i < 32; ++i) {
286 vst1q_u8(dst + i * stride, dc);
287 vst1q_u8(dst + i * stride + 16, dc);
288 }
289 }
290}
291
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
294 dc_32x32(dst, stride, above, left, 1, 1);
295}
296
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
299 const uint8_t *left) {
300 (void)above;
301 dc_32x32(dst, stride, NULL, left, 0, 1);
302}
303
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
306 const uint8_t *left) {
307 (void)left;
308 dc_32x32(dst, stride, above, NULL, 1, 0);
309}
310
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
313 const uint8_t *left) {
314 (void)above;
315 (void)left;
316 dc_32x32(dst, stride, NULL, NULL, 0, 0);
317}
318
319// -----------------------------------------------------------------------------
320
void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
323 const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
324 const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
325 const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
326 const uint32x2_t zero = vdup_n_u32(0);
327 const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
328 const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
329 const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
330 const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
331 const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
332 const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
333 const uint8_t D = vget_lane_u8(XABCD_u8, 4);
334 const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
335 const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
336 const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
337 const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
338 const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
339 const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
340 const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
341 const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
342 const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
343 vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
344 vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
345 vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
346 vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
347}
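
// Each d135 output pixel above is formed from three consecutive samples of
// the packed border LKJIXABCD, roughly the (1, 2, 1) diagonal smoothing
//   out = ((lo + hi) / 2 + mid + 1) / 2
// implemented with vhadd_u8 (halving add) on the outer taps followed by
// vrhadd_u8 (rounding halving add) with the middle tap.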
348
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
351 int i;
352 uint32x2_t d0u32 = vdup_n_u32(0);
353 (void)left;
354
355 d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
356 for (i = 0; i < 4; i++, dst += stride)
357 vst1_lane_u32((uint32_t *)dst, d0u32, 0);
358}
359
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
362 int i;
363 uint8x8_t d0u8 = vdup_n_u8(0);
364 (void)left;
365
366 d0u8 = vld1_u8(above);
367 for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
368}
369
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
372 int i;
373 uint8x16_t q0u8 = vdupq_n_u8(0);
374 (void)left;
375
376 q0u8 = vld1q_u8(above);
377 for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
378}
379
void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
382 int i;
383 uint8x16_t q0u8 = vdupq_n_u8(0);
384 uint8x16_t q1u8 = vdupq_n_u8(0);
385 (void)left;
386
387 q0u8 = vld1q_u8(above);
388 q1u8 = vld1q_u8(above + 16);
389 for (i = 0; i < 32; i++, dst += stride) {
390 vst1q_u8(dst, q0u8);
391 vst1q_u8(dst + 16, q1u8);
392 }
393}
394
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
397 uint8x8_t d0u8 = vdup_n_u8(0);
398 uint32x2_t d1u32 = vdup_n_u32(0);
399 (void)above;
400
401 d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
402
403 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
404 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
405 dst += stride;
406 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
407 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
408 dst += stride;
409 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
410 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
411 dst += stride;
412 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
413 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
414}
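
// All of the h (horizontal) predictors reduce to the scalar loop below; the
// NEON versions simply broadcast each left-column byte across a full row:
//   for (int r = 0; r < bh; ++r) memset(dst + r * stride, left[r], bw);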
415
void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
418 uint8x8_t d0u8 = vdup_n_u8(0);
419 uint64x1_t d1u64 = vdup_n_u64(0);
420 (void)above;
421
422 d1u64 = vld1_u64((const uint64_t *)left);
423
424 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
425 vst1_u8(dst, d0u8);
426 dst += stride;
427 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
428 vst1_u8(dst, d0u8);
429 dst += stride;
430 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
431 vst1_u8(dst, d0u8);
432 dst += stride;
433 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
434 vst1_u8(dst, d0u8);
435 dst += stride;
436 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
437 vst1_u8(dst, d0u8);
438 dst += stride;
439 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
440 vst1_u8(dst, d0u8);
441 dst += stride;
442 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
443 vst1_u8(dst, d0u8);
444 dst += stride;
445 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
446 vst1_u8(dst, d0u8);
447}
448
void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
451 int j;
452 uint8x8_t d2u8 = vdup_n_u8(0);
453 uint8x16_t q0u8 = vdupq_n_u8(0);
454 uint8x16_t q1u8 = vdupq_n_u8(0);
455 (void)above;
456
457 q1u8 = vld1q_u8(left);
458 d2u8 = vget_low_u8(q1u8);
459 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
460 q0u8 = vdupq_lane_u8(d2u8, 0);
461 vst1q_u8(dst, q0u8);
462 dst += stride;
463 q0u8 = vdupq_lane_u8(d2u8, 1);
464 vst1q_u8(dst, q0u8);
465 dst += stride;
466 q0u8 = vdupq_lane_u8(d2u8, 2);
467 vst1q_u8(dst, q0u8);
468 dst += stride;
469 q0u8 = vdupq_lane_u8(d2u8, 3);
470 vst1q_u8(dst, q0u8);
471 dst += stride;
472 q0u8 = vdupq_lane_u8(d2u8, 4);
473 vst1q_u8(dst, q0u8);
474 dst += stride;
475 q0u8 = vdupq_lane_u8(d2u8, 5);
476 vst1q_u8(dst, q0u8);
477 dst += stride;
478 q0u8 = vdupq_lane_u8(d2u8, 6);
479 vst1q_u8(dst, q0u8);
480 dst += stride;
481 q0u8 = vdupq_lane_u8(d2u8, 7);
482 vst1q_u8(dst, q0u8);
483 dst += stride;
484 }
485}
486
void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
489 int j, k;
490 uint8x8_t d2u8 = vdup_n_u8(0);
491 uint8x16_t q0u8 = vdupq_n_u8(0);
492 uint8x16_t q1u8 = vdupq_n_u8(0);
493 (void)above;
494
495 for (k = 0; k < 2; k++, left += 16) {
496 q1u8 = vld1q_u8(left);
497 d2u8 = vget_low_u8(q1u8);
498 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
499 q0u8 = vdupq_lane_u8(d2u8, 0);
500 vst1q_u8(dst, q0u8);
501 vst1q_u8(dst + 16, q0u8);
502 dst += stride;
503 q0u8 = vdupq_lane_u8(d2u8, 1);
504 vst1q_u8(dst, q0u8);
505 vst1q_u8(dst + 16, q0u8);
506 dst += stride;
507 q0u8 = vdupq_lane_u8(d2u8, 2);
508 vst1q_u8(dst, q0u8);
509 vst1q_u8(dst + 16, q0u8);
510 dst += stride;
511 q0u8 = vdupq_lane_u8(d2u8, 3);
512 vst1q_u8(dst, q0u8);
513 vst1q_u8(dst + 16, q0u8);
514 dst += stride;
515 q0u8 = vdupq_lane_u8(d2u8, 4);
516 vst1q_u8(dst, q0u8);
517 vst1q_u8(dst + 16, q0u8);
518 dst += stride;
519 q0u8 = vdupq_lane_u8(d2u8, 5);
520 vst1q_u8(dst, q0u8);
521 vst1q_u8(dst + 16, q0u8);
522 dst += stride;
523 q0u8 = vdupq_lane_u8(d2u8, 6);
524 vst1q_u8(dst, q0u8);
525 vst1q_u8(dst + 16, q0u8);
526 dst += stride;
527 q0u8 = vdupq_lane_u8(d2u8, 7);
528 vst1q_u8(dst, q0u8);
529 vst1q_u8(dst + 16, q0u8);
530 dst += stride;
531 }
532 }
533}

static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
536 const uint16_t *above,
537 const uint16_t *left) {
538 assert(bw >= 4);
539 assert(IS_POWER_OF_TWO(bw));
540 int expected_dc, sum = 0;
541 const int count = bw * 2;
542 uint32x4_t sum_q = vdupq_n_u32(0);
543 uint32x2_t sum_d;
544 uint16_t *dst_1;
545 if (bw >= 8) {
546 for (int i = 0; i < bw; i += 8) {
547 sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
548 sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
549 above += 8;
550 left += 8;
551 }
552 sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
553 sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
554 expected_dc = (sum + (count >> 1)) / count;
555 const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
556 for (int r = 0; r < bw; r++) {
557 dst_1 = dst;
558 for (int i = 0; i < bw; i += 8) {
559 vst1q_u16(dst_1, dc);
560 dst_1 += 8;
561 }
562 dst += stride;
563 }
564 } else { // 4x4
565 sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
566 sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
567 sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
568 expected_dc = (sum + (count >> 1)) / count;
569 const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
570 for (int r = 0; r < bw; r++) {
571 vst1_u16(dst, dc);
572 dst += stride;
573 }
574 }
575}
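
// Scalar reference for the high-bitdepth DC path above (sketch): the block is
// square, so each border contributes bw samples and
//   expected_dc = (sum(above[0..bw-1]) + sum(left[0..bw-1]) + bw) / (2 * bw)
// which matches the vaddl_u16 / vpadalq_u16 accumulation and rounding above.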
576
577#define intra_pred_highbd_sized_neon(type, width) \
578 void aom_highbd_##type##_predictor_##width##x##width##_neon( \
579 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
580 const uint16_t *left, int bd) { \
581 (void)bd; \
582 highbd_##type##_predictor(dst, stride, width, above, left); \
583 }
584
585#define intra_pred_square(type) \
586 intra_pred_highbd_sized_neon(type, 4); \
587 intra_pred_highbd_sized_neon(type, 8); \
588 intra_pred_highbd_sized_neon(type, 16); \
589 intra_pred_highbd_sized_neon(type, 32); \
590 intra_pred_highbd_sized_neon(type, 64);
591
592intra_pred_square(dc);
593#undef intra_pred_square
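
// As an illustration, intra_pred_square(dc) above expands to the five square
// wrappers aom_highbd_dc_predictor_{4x4,8x8,16x16,32x32,64x64}_neon(), each
// of which discards the bit-depth argument and forwards to
// highbd_dc_predictor() with the matching width.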

/* ---------------------P R E D I C T I O N Z 1--------------------------- */
596
597static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
598 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
599 { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
600 { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
601 { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
602 { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
603 { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
604 { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
605 { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
606};
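
// Each EvenOddMaskx[shift] row is an index vector for vtbl2_u8: it
// de-interleaves a 16-byte load into its even samples (low half) and odd
// samples (high half), with the leading `shift` lanes pinned to index 0 so
// that positions before the first valid edge sample are padded. The z2
// kernels use it when the above edge is upsampled and therefore stored as
// interleaved even/odd pairs.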
607
608// Low bit depth functions
609static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
610 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
611 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
612 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
613 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
614 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
615 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
616 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
618 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
619 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
620 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
622 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
624 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
626 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
628 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
630 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
631 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
633 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
635 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
636 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
637 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
639 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
640 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
642 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
643 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
644 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
645 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
646 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
648 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
649 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
650 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
651 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
652 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
653 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
654 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
655 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
656 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
657 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
658 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
660 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
661 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
663 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
664 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
666 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
667 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
669 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
670 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
671 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
672 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
673 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
674 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
675 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
676 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
677 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
678 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
679 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
680 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
681 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
682 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
683 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
684 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
685 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
686 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
687 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
688 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
689 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
690 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
691 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
692 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
693 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
694 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
695 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
696 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
697 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
698 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
699};
700
701/* clang-format on */
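
// BaseMask[k] has 0xff in its first k bytes and 0x00 in the rest. The z1/z2
// kernels use it as a per-byte select between computed pixels and the
// replicated last edge sample, e.g. for an 8-wide row (`pred` standing in
// for the interpolated result vector):
//   uint8x8_t out = vorr_u8(vand_u8(mask, pred), vbic_u8(a_mbase_x, mask));
// which keeps `pred` where the mask is set and a_mbase_x elsewhere.
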
702static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
703 int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
704 int dx) {
705 const int frac_bits = 6 - upsample_above;
706 const int max_base_x = ((W + H) - 1) << upsample_above;
707
708 assert(dx > 0);
709 // pre-filter above pixels
710 // store in temp buffers:
711 // above[x] * 32 + 16
712 // above[x+1] - above[x]
713 // final pixels will be calculated as:
714 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
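  //
  // Equivalently, each output pixel is a two-tap interpolation between
  // neighbouring edge samples (scalar sketch, ignoring upsampling):
  //   int v = above[base] * (32 - shift) + above[base + 1] * shift;
  //   pixel = (v + 16) >> 5;
  // with shift = (x & 0x3f) >> 1 giving the 5-bit sub-pel position.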
715
716 uint16x8_t a0, a1;
717 uint16x8_t diff, a32;
718 uint16x8_t a16;
719 uint8x8_t a_mbase_x;
720
721 a16 = vdupq_n_u16(16);
722 a_mbase_x = vdup_n_u8(above[max_base_x]);
723 uint16x8_t v_32 = vdupq_n_u16(32);
724 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
725 uint16x8_t c3f = vdupq_n_u16(0x3f);
726
727 int x = dx;
728 for (int r = 0; r < W; r++) {
729 uint16x8_t res;
730 uint16x8_t shift;
731 uint8x8x2_t v_tmp_a0_128;
732
733 int base = x >> frac_bits;
734 int base_max_diff = (max_base_x - base) >> upsample_above;
735 if (base_max_diff <= 0) {
736 for (int i = r; i < W; ++i) {
737 dst[i] = a_mbase_x; // save 4 values
738 }
739 return;
740 }
741
742 if (base_max_diff > H) base_max_diff = H;
743
744 if (upsample_above) {
745 v_tmp_a0_128 = vld2_u8(above + base);
746 shift = vshrq_n_u16(
747 vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
748 } else {
749 v_tmp_a0_128.val[0] = vld1_u8(above + base);
750 v_tmp_a0_128.val[1] = vld1_u8(above + base + 1);
751 shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
752 }
753 a0 = vmovl_u8(v_tmp_a0_128.val[0]);
754 a1 = vmovl_u8(v_tmp_a0_128.val[1]);
755 diff = vsubq_u16(a1, a0); // a[x+1] - a[x]
756 a32 = vmlaq_u16(a16, a0, v_32); // a[x] * 32 + 16
757 res = vmlaq_u16(a32, diff, shift);
758
759 uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
760 dst[r] =
761 vorr_u8(vand_u8(mask, vshrn_n_u16(res, 5)), vbic_u8(a_mbase_x, mask));
762
763 x += dx;
764 }
765}
766
767static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
768 const uint8_t *above, int upsample_above,
769 int dx) {
770 uint8x8_t dstvec[16];
771
772 dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
773 dx);
774 for (int i = 0; i < N; i++) {
775 vst1_lane_u32((uint32_t *)(dst + stride * i),
776 vreinterpret_u32_u8(dstvec[i]), 0);
777 }
778}
779
780static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
781 const uint8_t *above, int upsample_above,
782 int dx) {
783 uint8x8_t dstvec[32];
784
785 dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
786 dx);
787 for (int i = 0; i < N; i++) {
788 vst1_u8(dst + stride * i, dstvec[i]);
789 }
790}
791
792static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
793 int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
794 int dx) {
795 const int frac_bits = 6 - upsample_above;
796 const int max_base_x = ((W + H) - 1) << upsample_above;
797
798 assert(dx > 0);
799 // pre-filter above pixels
800 // store in temp buffers:
801 // above[x] * 32 + 16
802 // above[x+1] - above[x]
803 // final pixels will be calculated as:
804 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
805
806 uint8x16x2_t a0, a1;
807 uint16x8x2_t diff, a32;
808 uint16x8_t a16, c3f;
809 uint8x16_t a_mbase_x;
810
811 a16 = vdupq_n_u16(16);
812 a_mbase_x = vdupq_n_u8(above[max_base_x]);
813 c3f = vdupq_n_u16(0x3f);
814 uint16x8_t v_32 = vdupq_n_u16(32);
815 uint8x16_t v_zero = vdupq_n_u8(0);
816 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
817
818 int x = dx;
819 for (int r = 0; r < W; r++) {
820 uint16x8x2_t res;
821 uint16x8_t shift;
822 uint8x16_t a0_128, a1_128;
823
824 int base = x >> frac_bits;
825 int base_max_diff = (max_base_x - base) >> upsample_above;
826 if (base_max_diff <= 0) {
827 for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 16 values
829 }
830 return;
831 }
832
833 if (base_max_diff > H) base_max_diff = H;
834
835 if (upsample_above) {
836 uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
837 a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
838 a1_128 = vextq_u8(a0_128, v_zero, 8);
839 shift = vshrq_n_u16(
840 vandq_u16(vshlq_u16(vdupq_n_u16(x), v_upsample_above), c3f), 1);
841 } else {
842 a0_128 = vld1q_u8(above + base);
843 a1_128 = vld1q_u8(above + base + 1);
844 shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
845 }
846 a0 = vzipq_u8(a0_128, v_zero);
847 a1 = vzipq_u8(a1_128, v_zero);
848 diff.val[0] = vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
849 vreinterpretq_u16_u8(a0.val[0])); // a[x+1] - a[x]
850 diff.val[1] = vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
851 vreinterpretq_u16_u8(a0.val[1])); // a[x+1] - a[x]
852 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
853 v_32); // a[x] * 32 + 16
854 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
855 v_32); // a[x] * 32 + 16
856 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
857 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
858 uint8x16_t v_temp =
859 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
860
861 uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
862 dst[r] = vorrq_u8(vandq_u8(mask, v_temp), vbicq_u8(a_mbase_x, mask));
863
864 x += dx;
865 }
866}
867
868static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
869 const uint8_t *above, int upsample_above,
870 int dx) {
871 uint8x16_t dstvec[64];
872
873 dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
874 for (int i = 0; i < N; i++) {
875 vst1q_u8(dst + stride * i, dstvec[i]);
876 }
877}
878
879static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
880 int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above,
881 int dx) {
882 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
883 (void)upsample_above;
884 const int frac_bits = 6;
885 const int max_base_x = ((32 + N) - 1);
886
887 // pre-filter above pixels
888 // store in temp buffers:
889 // above[x] * 32 + 16
890 // above[x+1] - above[x]
891 // final pixels will be calculated as:
892 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
893
894 uint8x16_t a_mbase_x;
895 uint8x16x2_t a0, a1;
896 uint16x8x2_t diff, a32;
897 uint16x8_t a16, c3f;
898
899 a_mbase_x = vdupq_n_u8(above[max_base_x]);
900 a16 = vdupq_n_u16(16);
901 c3f = vdupq_n_u16(0x3f);
902 uint16x8_t v_32 = vdupq_n_u16(32);
903 uint8x16_t v_zero = vdupq_n_u8(0);
904
905 int x = dx;
906 for (int r = 0; r < N; r++) {
907 uint16x8x2_t res;
908 uint8x16_t res16[2];
909 uint8x16_t a0_128, a1_128;
910
911 int base = x >> frac_bits;
912 int base_max_diff = (max_base_x - base);
913 if (base_max_diff <= 0) {
914 for (int i = r; i < N; ++i) {
915 dstvec[i].val[0] = a_mbase_x; // save 32 values
916 dstvec[i].val[1] = a_mbase_x;
917 }
918 return;
919 }
920 if (base_max_diff > 32) base_max_diff = 32;
921
922 uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
923
924 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
925 int mdiff = base_max_diff - j;
926 if (mdiff <= 0) {
927 res16[jj] = a_mbase_x;
928 } else {
929 a0_128 = vld1q_u8(above + base + j);
930 a1_128 = vld1q_u8(above + base + j + 1);
931 a0 = vzipq_u8(a0_128, v_zero);
932 a1 = vzipq_u8(a1_128, v_zero);
933 diff.val[0] =
934 vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
935 vreinterpretq_u16_u8(a0.val[0])); // a[x+1] - a[x]
936 diff.val[1] =
937 vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
938 vreinterpretq_u16_u8(a0.val[1])); // a[x+1] - a[x]
939 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
940 v_32); // a[x] * 32 + 16
941 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
942 v_32); // a[x] * 32 + 16
943 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
944 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
945
946 res16[jj] =
947 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
948 }
949 }
950
951 uint8x16x2_t mask;
952
953 mask.val[0] = vld1q_u8(BaseMask[base_max_diff]);
954 mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16);
955 dstvec[r].val[0] = vorrq_u8(vandq_u8(mask.val[0], res16[0]),
956 vbicq_u8(a_mbase_x, mask.val[0]));
957 dstvec[r].val[1] = vorrq_u8(vandq_u8(mask.val[1], res16[1]),
958 vbicq_u8(a_mbase_x, mask.val[1]));
959 x += dx;
960 }
961}
962
963static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
964 const uint8_t *above, int upsample_above,
965 int dx) {
966 uint8x16x2_t dstvec[64];
967
968 dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx);
969 for (int i = 0; i < N; i++) {
970 vst1q_u8(dst + stride * i, dstvec[i].val[0]);
971 vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
972 }
973}
974
975static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
976 const uint8_t *above, int upsample_above,
977 int dx) {
978 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
979 (void)upsample_above;
980 const int frac_bits = 6;
981 const int max_base_x = ((64 + N) - 1);
982
983 // pre-filter above pixels
984 // store in temp buffers:
985 // above[x] * 32 + 16
986 // above[x+1] - above[x]
987 // final pixels will be calculated as:
988 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
989
990 uint8x16x2_t a0, a1;
991 uint16x8x2_t a32, diff;
992 uint16x8_t a16, c3f;
993 uint8x16_t a_mbase_x, max_base_x128, mask128;
994
995 a16 = vdupq_n_u16(16);
996 a_mbase_x = vdupq_n_u8(above[max_base_x]);
997 max_base_x128 = vdupq_n_u8(max_base_x);
998 c3f = vdupq_n_u16(0x3f);
999 uint16x8_t v_32 = vdupq_n_u16(32);
1000 uint8x16_t v_zero = vdupq_n_u8(0);
1001 uint8x16_t step = vdupq_n_u8(16);
1002
1003 int x = dx;
1004 for (int r = 0; r < N; r++, dst += stride) {
1005 uint16x8x2_t res;
1006
1007 int base = x >> frac_bits;
1008 if (base >= max_base_x) {
1009 for (int i = r; i < N; ++i) {
1010 vst1q_u8(dst, a_mbase_x);
1011 vst1q_u8(dst + 16, a_mbase_x);
1012 vst1q_u8(dst + 32, a_mbase_x);
1013 vst1q_u8(dst + 48, a_mbase_x);
1014 dst += stride;
1015 }
1016 return;
1017 }
1018
1019 uint16x8_t shift = vshrq_n_u16(vandq_u16(vdupq_n_u16(x), c3f), 1);
1020 uint8x16_t a0_128, a1_128, res128;
1021 uint8x16_t base_inc128 =
1022 vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1023 vcreate_u8(0x0F0E0D0C0B0A0908)));
1024
1025 for (int j = 0; j < 64; j += 16) {
1026 int mdif = max_base_x - (base + j);
1027 if (mdif <= 0) {
1028 vst1q_u8(dst + j, a_mbase_x);
1029 } else {
1030 a0_128 = vld1q_u8(above + base + j);
1031 a1_128 = vld1q_u8(above + base + 1 + j);
1032 a0 = vzipq_u8(a0_128, v_zero);
1033 a1 = vzipq_u8(a1_128, v_zero);
1034 diff.val[0] =
1035 vsubq_u16(vreinterpretq_u16_u8(a1.val[0]),
1036 vreinterpretq_u16_u8(a0.val[0])); // a[x+1] - a[x]
1037 diff.val[1] =
1038 vsubq_u16(vreinterpretq_u16_u8(a1.val[1]),
1039 vreinterpretq_u16_u8(a0.val[1])); // a[x+1] - a[x]
1040 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[0]),
1041 v_32); // a[x] * 32 + 16
1042 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0.val[1]),
1043 v_32); // a[x] * 32 + 16
1044 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift);
1045 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift);
1046 uint8x16_t v_temp =
1047 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1048
1049 mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero);
1050 res128 =
1051 vorrq_u8(vandq_u8(mask128, v_temp), vbicq_u8(a_mbase_x, mask128));
1052 vst1q_u8(dst + j, res128);
1053
1054 base_inc128 = vaddq_u8(base_inc128, step);
1055 }
1056 }
1057 x += dx;
1058 }
1059}
1060
1061// Directional prediction, zone 1: 0 < angle < 90
1062void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1063 const uint8_t *above, const uint8_t *left,
1064 int upsample_above, int dx, int dy) {
1065 (void)left;
1066 (void)dy;
1067
1068 switch (bw) {
1069 case 4:
1070 dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1071 break;
1072 case 8:
1073 dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1074 break;
1075 case 16:
1076 dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1077 break;
1078 case 32:
1079 dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx);
1080 break;
1081 case 64:
1082 dr_prediction_z1_64xN_neon(bh, dst, stride, above, upsample_above, dx);
1083 break;
1084 default: break;
1085 }
1086 return;
1087}
1088
1089/* ---------------------P R E D I C T I O N Z 2--------------------------- */
1090
1091static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
1092 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1093 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1094 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1095 0, 0, 0 },
1096 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1097 0xff, 0xff, 0xff, 0xff }
1098};
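
// LoadMaskz2[i] keeps the first 4 * (i + 1) bytes of a 16-byte load and
// zeroes the rest; dr_prediction_z2_HxW_neon() selects the row with
// offset_diff / 4 so that left-edge samples past the needed range are
// cleared before the table lookup.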
1099
1100static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero,
1101 int shift_value) {
1102 switch (shift_value) {
1103 case 1: *vec = vext_u8(*v_zero, *vec, 7); break;
1104 case 2: *vec = vext_u8(*v_zero, *vec, 6); break;
1105 case 3: *vec = vext_u8(*v_zero, *vec, 5); break;
1106 default: break;
1107 }
1108}
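
// vector_shift_x4() (and vector_shuffle() further below) emulate a variable
// byte shift: the vector is moved up by shift_value lanes with zeros shifted
// in at the bottom, e.g. shift_value == 2 turns {a,b,c,d,e,f,g,h} into
// {0,0,a,b,c,d,e,f}. NEON's vext intrinsics require an immediate shift, hence
// the switch over the possible values.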
1109
1110static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
1111 const uint8_t *above, const uint8_t *left,
1112 int upsample_above, int upsample_left,
1113 int dx, int dy) {
1114 const int min_base_x = -(1 << upsample_above);
1115 const int min_base_y = -(1 << upsample_left);
1116 const int frac_bits_x = 6 - upsample_above;
1117 const int frac_bits_y = 6 - upsample_left;
1118
1119 assert(dx > 0);
1120 // pre-filter above pixels
1121 // store in temp buffers:
1122 // above[x] * 32 + 16
1123 // above[x+1] - above[x]
1124 // final pixels will be calculated as:
1125 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1126 uint16x8_t a0_x, a1_x, a32, diff;
1127 uint16x8_t v_32 = vdupq_n_u16(32);
1128 uint16x8_t v_zero = vdupq_n_u16(0);
1129 uint16x8_t a16 = vdupq_n_u16(16);
1130
1131 uint8x8_t v_zero_u8 = vdup_n_u8(0);
1132 uint16x4_t v_c3f = vdup_n_u16(0x3f);
1133 uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
1134 int16x4_t v_upsample_left = vdup_n_s16(upsample_left);
1135 int16x4_t v_upsample_above = vdup_n_s16(upsample_above);
1136 int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1137 int16x4_t dy64 = vdup_n_s16(dy);
1138 int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1139 int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1140 int16x4_t v_one = vdup_lane_s16(v_1234, 0);
1141
1142 for (int r = 0; r < N; r++) {
1143 uint16x8_t res, shift;
1144 uint16x4_t ydx;
1145 uint8x8_t resx, resy;
1146 uint16x4x2_t v_shift;
1147
1148 int y = r + 1;
1149 int base_x = (-y * dx) >> frac_bits_x;
1150 int base_shift = 0;
1151 if (base_x < (min_base_x - 1)) {
1152 base_shift = (min_base_x - base_x - 1) >> upsample_above;
1153 }
1154 int base_min_diff =
1155 (min_base_x - base_x + upsample_above) >> upsample_above;
1156 if (base_min_diff > 4) {
1157 base_min_diff = 4;
1158 } else {
1159 if (base_min_diff < 0) base_min_diff = 0;
1160 }
1161
1162 if (base_shift > 3) {
1163 a0_x = v_zero;
1164 a1_x = v_zero;
1165 v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
1166 v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
1167 } else {
1168 ydx = vdup_n_u16(y * dx);
1169
1170 if (upsample_above) {
1171 uint8x8x2_t v_tmp;
1172 v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1173 v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1174 uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1175 uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
1176 a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low));
1177 a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high));
1178 v_shift.val[0] = vshr_n_u16(
1179 vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1);
1180 } else {
1181 uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift);
1182 vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift);
1183 uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1);
1184 v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1);
1185 a0_x = vmovl_u8(v_a0_x64);
1186 a1_x = vmovl_u8(v_a1_x64);
1187 }
1188 }
1189
1190 // y calc
1191 uint8x8_t a0_y, a1_y;
1192 if (base_x < min_base_x) {
1193 DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1194 int16x4_t v_r6 = vdup_n_s16(r << 6);
1195 int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1196 int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1197 uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
1198
1199 base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
1200 vst1_s16(base_y_c, base_y_c64);
1201 a0_y = v_zero_u8;
1202 a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
1203 a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
1204 a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
1205 a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);
1206
1207 base_y_c64 = vadd_s16(base_y_c64, v_one);
1208 vst1_s16(base_y_c, base_y_c64);
1209 a1_y = v_zero_u8;
1210 a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
1211 a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
1212 a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
1213 a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
1214
1215 if (upsample_left) {
1216 v_shift.val[1] = vshr_n_u16(
1217 vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left),
1218 v_c3f),
1219 1);
1220 } else {
1221 v_shift.val[1] =
1222 vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1);
1223 }
1224
1225 a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y));
1226 a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y));
1227 }
1228 shift = vcombine_u16(v_shift.val[0], v_shift.val[1]);
1229 diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
1230 a32 = vmlaq_u16(a16, a0_x, v_32); // a[x] * 32 + 16
1231 res = vmlaq_u16(a32, diff, shift);
1232 resx = vshrn_n_u16(res, 5);
1233 resy = vext_u8(resx, v_zero_u8, 4);
1234
1235 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1236 uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1237 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1238
1239 dst += stride;
1240 }
1241}
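
// In the z2 kernels each output row mixes two sources: columns whose
// projected position still falls on the above row (resx) and columns that
// have crossed the top-left corner and are read from the left column (resy).
// BaseMask[base_min_diff] selects, per byte, which of the two results is
// kept.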
1242
1243static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero,
1244 int shift_value) {
1245 switch (shift_value) {
1246 case 1: *vec = vextq_u8(*vzero, *vec, 15); break;
1247 case 2: *vec = vextq_u8(*vzero, *vec, 14); break;
1248 case 3: *vec = vextq_u8(*vzero, *vec, 13); break;
1249 case 4: *vec = vextq_u8(*vzero, *vec, 12); break;
1250 case 5: *vec = vextq_u8(*vzero, *vec, 11); break;
1251 case 6: *vec = vextq_u8(*vzero, *vec, 10); break;
1252 case 7: *vec = vextq_u8(*vzero, *vec, 9); break;
1253 case 8: *vec = vextq_u8(*vzero, *vec, 8); break;
1254 case 9: *vec = vextq_u8(*vzero, *vec, 7); break;
1255 case 10: *vec = vextq_u8(*vzero, *vec, 6); break;
1256 case 11: *vec = vextq_u8(*vzero, *vec, 5); break;
1257 case 12: *vec = vextq_u8(*vzero, *vec, 4); break;
1258 case 13: *vec = vextq_u8(*vzero, *vec, 3); break;
1259 case 14: *vec = vextq_u8(*vzero, *vec, 2); break;
1260 case 15: *vec = vextq_u8(*vzero, *vec, 1); break;
1261 default: break;
1262 }
1263}
1264
1265static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1266 const uint8_t *above, const uint8_t *left,
1267 int upsample_above, int upsample_left,
1268 int dx, int dy) {
1269 const int min_base_x = -(1 << upsample_above);
1270 const int min_base_y = -(1 << upsample_left);
1271 const int frac_bits_x = 6 - upsample_above;
1272 const int frac_bits_y = 6 - upsample_left;
1273
1274 // pre-filter above pixels
1275 // store in temp buffers:
1276 // above[x] * 32 + 16
1277 // above[x+1] - above[x]
1278 // final pixels will be calculated as:
1279 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1280 uint8x16x2_t a0_x, a1_x;
1281 uint16x8x2_t diff, a32;
1282 uint16x8_t c1234, a16, c3f;
1283 uint8x16_t a0_x128, a1_x128;
1284 int16x8_t min_base_y128, dy128;
1285 uint16x8_t v_32 = vdupq_n_u16(32);
1286 uint8x16_t v_zero = vdupq_n_u8(0);
1287 int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
1288 int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
1289 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1290
1291 a16 = vdupq_n_u16(16);
1292 c3f = vdupq_n_u16(0x3f);
1293 min_base_y128 = vdupq_n_s16(min_base_y);
1294 dy128 = vdupq_n_s16(dy);
1295 c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1296 vcreate_u16(0x0008000700060005));
1297
1298 for (int r = 0; r < N; r++) {
1299 uint8x8_t resx, resy, resxy;
1300 uint16x8_t r6, ydx;
1301 uint16x8x2_t res, shift;
1302
1303 int y = r + 1;
1304 int base_x = (-y * dx) >> frac_bits_x;
1305 int base_shift = 0;
1306 if (base_x < (min_base_x - 1)) {
1307 base_shift = (min_base_x - base_x - 1) >> upsample_above;
1308 }
1309 int base_min_diff =
1310 (min_base_x - base_x + upsample_above) >> upsample_above;
1311 if (base_min_diff > 8) {
1312 base_min_diff = 8;
1313 } else {
1314 if (base_min_diff < 0) base_min_diff = 0;
1315 }
1316
1317 if (base_shift > 7) {
1318 a0_x.val[0] = v_zero;
1319 a0_x.val[1] = v_zero;
1320 a1_x.val[0] = v_zero;
1321 a1_x.val[1] = v_zero;
1322 shift.val[0] = vreinterpretq_u16_u8(v_zero);
1323 shift.val[1] = vreinterpretq_u16_u8(v_zero);
1324 } else {
1325 ydx = vdupq_n_u16(y * dx);
1326 r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
1327
1328 if (upsample_above) {
1329 uint8x8x2_t v_tmp;
1330 v_tmp.val[0] = vld1_u8(above + base_x + base_shift);
1331 v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8);
1332 uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]);
1333 uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
1334 shift.val[0] = vshrq_n_u16(
1335 vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
1336 a0_x.val[0] =
1337 vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
1338 a1_x.val[0] =
1339 vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
1340 } else {
1341 a0_x128 = vld1q_u8(above + base_x + base_shift);
1342 a1_x128 = vextq_u8(a0_x128, v_zero, 1);
1343 vector_shuffle(&a0_x128, &v_zero, base_shift);
1344 vector_shuffle(&a1_x128, &v_zero, base_shift);
1345 shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
1346 a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
1347 a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
1348 }
1349 }
1350
1351 // y calc
1352 if (base_x < min_base_x) {
1353 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1354 int16x8_t y_c128, base_y_c128;
1355 uint16x8_t mask128;
1356 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1357
1358 y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1359 base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1360 mask128 = vcgtq_s16(min_base_y128, base_y_c128);
1361
1362 base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
1363 vst1q_s16(base_y_c, base_y_c128);
1364 a0_x.val[1] = v_zero;
1365 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
1366 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
1367 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
1368 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
1369 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
1370 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
1371 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
1372 a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
1373
1374 base_y_c128 =
1375 vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
1376 vst1q_s16(base_y_c, base_y_c128);
1377 a1_x.val[1] = v_zero;
1378 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
1379 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
1380 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
1381 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
1382 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
1383 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
1384 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
1385 a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
1386
1387 if (upsample_left) {
1388 shift.val[1] = vshrq_n_u16(
1389 vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left),
1390 c3f),
1391 1);
1392 } else {
1393 shift.val[1] =
1394 vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
1395 }
1396 }
1397 diff.val[0] =
1398 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1399 vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x]
1400 diff.val[1] =
1401 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1402 vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x]
1403 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1404 v_32); // a[x] * 32 + 16
1405 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1406 v_32); // a[x] * 32 + 16
1407 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1408 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1409 resx = vshrn_n_u16(res.val[0], 5);
1410 resy = vshrn_n_u16(res.val[1], 5);
1411
1412 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1413
1414 resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
1415 vst1_u8(dst, resxy);
1416 dst += stride;
1417 }
1418}
1419
1420static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
1421 ptrdiff_t stride, const uint8_t *above,
1422 const uint8_t *left, int upsample_above,
1423 int upsample_left, int dx, int dy) {
1424 // here upsample_above and upsample_left are 0 by design of
1425 // av1_use_intra_edge_upsample
1426 const int min_base_x = -1;
1427 const int min_base_y = -1;
1428 (void)upsample_above;
1429 (void)upsample_left;
1430 const int frac_bits_x = 6;
1431 const int frac_bits_y = 6;
1432
1433 uint16x8_t a16, c1, c3f;
1434 int16x8_t min_base_y256, dy256;
1435 uint16x8x2_t a32, c0123, c1234, diff, shifty;
1436 uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
1437 uint8x16_t a0_x128, a1_x128;
1438 uint16x8_t v_32 = vdupq_n_u16(32);
1439 uint8x16_t v_zero = vdupq_n_u8(0);
1440 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1441
1442 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
1443
1444 a16 = vdupq_n_u16(16);
1445 c1 = vshrq_n_u16(a16, 4);
1446 min_base_y256 = vdupq_n_s16(min_base_y);
1447 c3f = vdupq_n_u16(0x3f);
1448 dy256 = vdupq_n_s16(dy);
1449 c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
1450 vcreate_u16(0x0007000600050004));
1451 c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
1452 vcreate_u16(0x000F000E000D000C));
1453 c1234.val[0] = vaddq_u16(c0123.val[0], c1);
1454 c1234.val[1] = vaddq_u16(c0123.val[1], c1);
1455
1456 for (int r = 0; r < H; r++) {
1457 uint16x8x2_t res, r6, shift;
1458 uint16x8_t ydx, j256;
1459 uint8x16_t resx, resy, resxy;
1460 int y = r + 1;
1461 ydx = vdupq_n_u16((uint16_t)(y * dx));
1462
1463 int base_x = (-y * dx) >> frac_bits_x;
1464 for (int j = 0; j < W; j += 16) {
1465 j256 = vdupq_n_u16(j);
1466
1467 int base_shift = 0;
1468 if ((base_x + j) < (min_base_x - 1)) {
1469 base_shift = (min_base_x - (base_x + j) - 1);
1470 }
1471 int base_min_diff = (min_base_x - base_x - j);
1472 if (base_min_diff > 16) {
1473 base_min_diff = 16;
1474 } else {
1475 if (base_min_diff < 0) base_min_diff = 0;
1476 }
1477
1478 if (base_shift < 16) {
1479 a0_x128 = vld1q_u8(above + base_x + base_shift + j);
1480 a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
1481 vector_shuffle(&a0_x128, &v_zero, base_shift);
1482 vector_shuffle(&a1_x128, &v_zero, base_shift);
1483 a0_x = vzipq_u8(a0_x128, v_zero);
1484 a1_x = vzipq_u8(a1_x128, v_zero);
1485 r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1486 r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1487 shift.val[0] =
1488 vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1);
1489 shift.val[1] =
1490 vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1);
1491 diff.val[0] =
1492 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
1493 vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x]
1494 diff.val[1] =
1495 vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
1496 vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x]
1497 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
1498 v_32); // a[x] * 32 + 16
1499 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
1500 v_32); // a[x] * 32 + 16
1501 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
1502 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
1503 resx =
1504 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1505 } else {
1506 resx = v_zero;
1507 }
1508
1509 // y calc
1510 if (base_x < min_base_x) {
1511 uint16x8x2_t mask256;
1512 int16x8x2_t c256, y_c256, base_y_c256, mul16;
1513 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1514
1515 c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256),
1516 vreinterpretq_s16_u16(c1234.val[0]));
1517 c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256),
1518 vreinterpretq_s16_u16(c1234.val[1]));
1519 mul16.val[0] = vminq_s16(vmulq_s16(c256.val[0], dy256),
1520 vreinterpretq_s16_u16(vshrq_n_u16(
1521 vreinterpretq_u16_s16(min_base_y256), 1)));
1522 mul16.val[1] = vminq_s16(vmulq_s16(c256.val[1], dy256),
1523 vreinterpretq_s16_u16(vshrq_n_u16(
1524 vreinterpretq_u16_s16(min_base_y256), 1)));
1525 y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]);
1526 y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]);
1527
1528 base_y_c256.val[0] = vshlq_s16(y_c256.val[0], v_frac_bits_y);
1529 base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y);
1530 mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
1531 mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
1532
1533 base_y_c256.val[0] = vorrq_s16(
1534 vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
1535 vbicq_s16(base_y_c256.val[0],
1536 vreinterpretq_s16_u16(mask256.val[0])));
1537 base_y_c256.val[1] = vorrq_s16(
1538 vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
1539 vbicq_s16(base_y_c256.val[1],
1540 vreinterpretq_s16_u16(mask256.val[1])));
1541
1542 int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
1543 int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
1544 int16_t offset_diff = max_y - min_y;
1545
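        // Gather left[base_y] for the 16 lanes. If the 16 indices span fewer
        // than 16 bytes, a single 16-byte load plus a byte table lookup
        // (vqtbl1q_u8 / vtbl2_u8) is enough; otherwise fall back to 16
        // per-lane loads through the base_y_c scratch array.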
1546 if (offset_diff < 16) {
1547 int16x8_t min_y256 =
1548 vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3);
1549
1550 int16x8x2_t base_y_offset;
1551 base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256);
1552 base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256);
1553
1554 int8x16_t base_y_offset128 =
1555 vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1556 vqmovn_s16(base_y_offset.val[1]));
1557
1558 uint8x16_t a0_y128, a1_y128;
1559 uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1560 a0_y128 = vld1q_u8(left + min_y);
1561 a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1562 a1_y128 = vld1q_u8(left + min_y + 1);
1563 a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1564#if defined(__aarch64__)
1565 a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128));
1566 a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128));
1567#else
1568 uint8x8x2_t v_tmp;
1569 uint8x8x2_t v_res;
1570 uint8x8_t v_index_low =
1571 vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1572 uint8x8_t v_index_high =
1573 vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1574 v_tmp.val[0] = vget_low_u8(a0_y128);
1575 v_tmp.val[1] = vget_high_u8(a0_y128);
1576 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1577 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1578 a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1579 v_tmp.val[0] = vget_low_u8(a1_y128);
1580 v_tmp.val[1] = vget_high_u8(a1_y128);
1581 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1582 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1583 a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1584#endif
1585 a0_y = vzipq_u8(a0_y128, v_zero);
1586 a1_y = vzipq_u8(a1_y128, v_zero);
1587 } else {
1588 base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
1589 vreinterpretq_s16_u16(mask256.val[0]));
1590 base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
1591 vreinterpretq_s16_u16(mask256.val[1]));
1592 vst1q_s16(base_y_c, base_y_c256.val[0]);
1593 vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1594 a0_y.val[0] = v_zero;
1595 a0_y.val[1] = v_zero;
1596 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
1597 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
1598 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
1599 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
1600 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
1601 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
1602 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
1603 a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
1604 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
1605 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
1606 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
1607 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
1608 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
1609 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
1610 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
1611 a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
1612
1613 base_y_c256.val[0] =
1614 vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
1615 base_y_c256.val[1] =
1616 vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
1617 vst1q_s16(base_y_c, base_y_c256.val[0]);
1618 vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
1619 a1_y.val[0] = v_zero;
1620 a1_y.val[1] = v_zero;
1621 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
1622 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
1623 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
1624 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
1625 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
1626 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
1627 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
1628 a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
1629 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
1630 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
1631 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
1632 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
1633 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
1634 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
1635 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
1636 a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
1637 }
1638 shifty.val[0] = vshrq_n_u16(
1639 vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
1640 shifty.val[1] = vshrq_n_u16(
1641 vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
1642 diff.val[0] =
1643 vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
1644 vreinterpretq_u16_u8(a0_y.val[0])); // a[x+1] - a[x]
1645 diff.val[1] =
1646 vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
1647 vreinterpretq_u16_u8(a0_y.val[1])); // a[x+1] - a[x]
1648 a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
1649 v_32); // a[x] * 32 + 16
1650 a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
1651 v_32); // a[x] * 32 + 16
1652 res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
1653 res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
1654
1655 resy =
1656 vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5));
1657 } else {
1658 resy = v_zero;
1659 }
1660 uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
1661 resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
1662 vst1q_u8(dst + j, resxy);
1663 } // for j
1664 dst += stride;
1665 }
1666}
1667
1668// Directional prediction, zone 2: 90 < angle < 180
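// For reference, a scalar sketch of the per-pixel rule the z2 kernels above
// vectorize and this function dispatches to (simplified to the case handled
// here: upsample_above == upsample_left == 0, so positions use 6 fractional
// bits and min_base_x == min_base_y == -1; 'p' is illustrative):
//
//   for (r = 0; r < bh; ++r) {
//     for (c = 0; c < bw; ++c) {
//       int x = (c << 6) - (r + 1) * dx;
//       if ((x >> 6) >= -1) {  // sample the above row
//         int base = x >> 6, shift = (x & 0x3f) >> 1;
//         p = (above[base] * (32 - shift) + above[base + 1] * shift + 16) >> 5;
//       } else {               // sample the left column
//         int y = (r << 6) - (c + 1) * dy;
//         int base = y >> 6, shift = (y & 0x3f) >> 1;
//         p = (left[base] * (32 - shift) + left[base + 1] * shift + 16) >> 5;
//       }
//       dst[r * stride + c] = (uint8_t)p;
//     }
//   }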
1669void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1670 const uint8_t *above, const uint8_t *left,
1671 int upsample_above, int upsample_left, int dx,
1672 int dy) {
1673 assert(dx > 0);
1674 assert(dy > 0);
1675
1676 switch (bw) {
1677 case 4:
1678 dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
1679 upsample_left, dx, dy);
1680 break;
1681 case 8:
1682 dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
1683 upsample_left, dx, dy);
1684 break;
1685 default:
1686 dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left,
1687 upsample_above, upsample_left, dx, dy);
1688 break;
1689 }
1690 return;
1691}
1692
1693/* ---------------------P R E D I C T I O N Z 3--------------------------- */
1694
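// Zone 3 (180 < angle < 270) predicts only from the left column. Each block
// size below runs a zone-1 kernel along `left` (or into a scratch buffer for
// the 64-pixel-wide/tall cases) and then transposes the result into dst, so
// most of this section is 8-bit transpose helpers built from interleaves
// (vzip) at successively wider element types: 8 -> 16 -> 32 -> 64 bits.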
1695static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x,
1696 uint16x8x2_t *d) {
1697 uint8x16x2_t w0, w1;
1698
1699 w0 = vzipq_u8(x[0], x[1]);
1700 w1 = vzipq_u8(x[2], x[3]);
1701
1702 d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1703 vreinterpretq_u16_u8(w1.val[0]));
1704 d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1705 vreinterpretq_u16_u8(w1.val[1]));
1706}
1707
1708static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x,
1709 uint16x4x2_t *d) {
1710 uint8x8x2_t w0, w1;
1711
1712 w0 = vzip_u8(x[0], x[1]);
1713 w1 = vzip_u8(x[2], x[3]);
1714
1715 *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1716}
1717
1718static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x,
1719 uint16x4x2_t *d) {
1720 uint8x8x2_t w0, w1;
1721
1722 w0 = vzip_u8(x[0], x[1]);
1723 w1 = vzip_u8(x[2], x[3]);
1724
1725 d[0] =
1726 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1727 d[1] =
1728 vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1729}
1730
1731static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x,
1732 uint32x2x2_t *d) {
1733 uint8x8x2_t w0, w1, w2, w3;
1734 uint16x4x2_t w4, w5;
1735
1736 w0 = vzip_u8(x[0], x[1]);
1737 w1 = vzip_u8(x[2], x[3]);
1738 w2 = vzip_u8(x[4], x[5]);
1739 w3 = vzip_u8(x[6], x[7]);
1740
1741 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1742 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1743
1744 d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1745 vreinterpret_u32_u16(w5.val[0]));
1746 d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1747 vreinterpret_u32_u16(w5.val[1]));
1748}
1749
1750static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) {
1751 uint8x8x2_t w0, w1, w2, w3;
1752 uint16x4x2_t w4, w5, w6, w7;
1753
1754 w0 = vzip_u8(x[0], x[1]);
1755 w1 = vzip_u8(x[2], x[3]);
1756 w2 = vzip_u8(x[4], x[5]);
1757 w3 = vzip_u8(x[6], x[7]);
1758
1759 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1760 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1761
1762 d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1763 vreinterpret_u32_u16(w5.val[0]));
1764 d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1765 vreinterpret_u32_u16(w5.val[1]));
1766
1767 w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1768 w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1769
1770 d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]),
1771 vreinterpret_u32_u16(w7.val[0]));
1772 d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]),
1773 vreinterpret_u32_u16(w7.val[1]));
1774}
1775
1776static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x,
1777 uint64x2_t *d) {
1778 uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11;
1779 uint16x4x2_t w4, w5, w12, w13;
1780 uint32x2x2_t w6, w7, w14, w15;
1781
1782 w0 = vzip_u8(x[0], x[1]);
1783 w1 = vzip_u8(x[2], x[3]);
1784 w2 = vzip_u8(x[4], x[5]);
1785 w3 = vzip_u8(x[6], x[7]);
1786
1787 w8 = vzip_u8(x[8], x[9]);
1788 w9 = vzip_u8(x[10], x[11]);
1789 w10 = vzip_u8(x[12], x[13]);
1790 w11 = vzip_u8(x[14], x[15]);
1791
1792 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
1793 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
1794 w12 =
1795 vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
1796 w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
1797 vreinterpret_u16_u8(w11.val[0]));
1798
1799 w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1800 vreinterpret_u32_u16(w5.val[0]));
1801 w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1802 vreinterpret_u32_u16(w5.val[1]));
1803 w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1804 vreinterpret_u32_u16(w13.val[0]));
1805 w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1806 vreinterpret_u32_u16(w13.val[1]));
1807
1808 // Store first 4-line result
1809 d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1810 vreinterpret_u64_u32(w14.val[0]));
1811 d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1812 vreinterpret_u64_u32(w14.val[1]));
1813 d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1814 vreinterpret_u64_u32(w15.val[0]));
1815 d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1816 vreinterpret_u64_u32(w15.val[1]));
1817
1818 w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
1819 w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
1820 w12 =
1821 vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
1822 w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
1823 vreinterpret_u16_u8(w11.val[1]));
1824
1825 w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
1826 vreinterpret_u32_u16(w5.val[0]));
1827 w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
1828 vreinterpret_u32_u16(w5.val[1]));
1829 w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
1830 vreinterpret_u32_u16(w13.val[0]));
1831 w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
1832 vreinterpret_u32_u16(w13.val[1]));
1833
1834 // Store second 4-line result
1835 d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]),
1836 vreinterpret_u64_u32(w14.val[0]));
1837 d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]),
1838 vreinterpret_u64_u32(w14.val[1]));
1839 d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]),
1840 vreinterpret_u64_u32(w15.val[0]));
1841 d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]),
1842 vreinterpret_u64_u32(w15.val[1]));
1843}
1844
1845static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x,
1846 uint64x2_t *d) {
1847 uint8x16x2_t w0, w1, w2, w3;
1848 uint16x8x2_t w4, w5, w6, w7;
1849 uint32x4x2_t w8, w9, w10, w11;
1850
1851 w0 = vzipq_u8(x[0], x[1]);
1852 w1 = vzipq_u8(x[2], x[3]);
1853 w2 = vzipq_u8(x[4], x[5]);
1854 w3 = vzipq_u8(x[6], x[7]);
1855
1856 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1857 vreinterpretq_u16_u8(w1.val[0]));
1858 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1859 vreinterpretq_u16_u8(w3.val[0]));
1860 w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1861 vreinterpretq_u16_u8(w1.val[1]));
1862 w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1863 vreinterpretq_u16_u8(w3.val[1]));
1864
1865 w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
1866 vreinterpretq_u32_u16(w5.val[0]));
1867 w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
1868 vreinterpretq_u32_u16(w7.val[0]));
1869 w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
1870 vreinterpretq_u32_u16(w5.val[1]));
1871 w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
1872 vreinterpretq_u32_u16(w7.val[1]));
1873
1874#if defined(__aarch64__)
1875 d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]),
1876 vreinterpretq_u64_u32(w9.val[0]));
1877 d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]),
1878 vreinterpretq_u64_u32(w9.val[0]));
1879 d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]),
1880 vreinterpretq_u64_u32(w9.val[1]));
1881 d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]),
1882 vreinterpretq_u64_u32(w9.val[1]));
1883 d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]),
1884 vreinterpretq_u64_u32(w11.val[0]));
1885 d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]),
1886 vreinterpretq_u64_u32(w11.val[0]));
1887 d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]),
1888 vreinterpretq_u64_u32(w11.val[1]));
1889 d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]),
1890 vreinterpretq_u64_u32(w11.val[1]));
1891#else
1892 d[0] = vreinterpretq_u64_u32(
1893 vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0])));
1894 d[1] = vreinterpretq_u64_u32(
1895 vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0])));
1896 d[2] = vreinterpretq_u64_u32(
1897 vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1])));
1898 d[3] = vreinterpretq_u64_u32(
1899 vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1])));
1900 d[4] = vreinterpretq_u64_u32(
1901 vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0])));
1902 d[5] = vreinterpretq_u64_u32(
1903 vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0])));
1904 d[6] = vreinterpretq_u64_u32(
1905 vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1])));
1906 d[7] = vreinterpretq_u64_u32(
1907 vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1])));
1908#endif
1909}
1910
1911static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) {
1912 uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7;
1913 uint16x8x2_t w8, w9, w10, w11;
1914 uint32x4x2_t w12, w13, w14, w15;
1915
1916 w0 = vzipq_u8(x[0], x[1]);
1917 w1 = vzipq_u8(x[2], x[3]);
1918 w2 = vzipq_u8(x[4], x[5]);
1919 w3 = vzipq_u8(x[6], x[7]);
1920
1921 w4 = vzipq_u8(x[8], x[9]);
1922 w5 = vzipq_u8(x[10], x[11]);
1923 w6 = vzipq_u8(x[12], x[13]);
1924 w7 = vzipq_u8(x[14], x[15]);
1925
1926 w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
1927 vreinterpretq_u16_u8(w1.val[0]));
1928 w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
1929 vreinterpretq_u16_u8(w3.val[0]));
1930 w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
1931 vreinterpretq_u16_u8(w5.val[0]));
1932 w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
1933 vreinterpretq_u16_u8(w7.val[0]));
1934
1935 w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1936 vreinterpretq_u32_u16(w9.val[0]));
1937 w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1938 vreinterpretq_u32_u16(w11.val[0]));
1939 w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1940 vreinterpretq_u32_u16(w9.val[1]));
1941 w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1942 vreinterpretq_u32_u16(w11.val[1]));
1943
1944#if defined(__aarch64__)
1945 d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
1946 vreinterpretq_u64_u32(w13.val[0]));
1947 d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
1948 vreinterpretq_u64_u32(w13.val[0]));
1949 d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
1950 vreinterpretq_u64_u32(w13.val[1]));
1951 d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
1952 vreinterpretq_u64_u32(w13.val[1]));
1953 d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
1954 vreinterpretq_u64_u32(w15.val[0]));
1955 d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
1956 vreinterpretq_u64_u32(w15.val[0]));
1957 d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
1958 vreinterpretq_u64_u32(w15.val[1]));
1959 d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
1960 vreinterpretq_u64_u32(w15.val[1]));
1961#else
1962 d[0] = vreinterpretq_u64_u32(
1963 vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
1964 d[1] = vreinterpretq_u64_u32(
1965 vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
1966 d[2] = vreinterpretq_u64_u32(
1967 vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
1968 d[3] = vreinterpretq_u64_u32(
1969 vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
1970 d[4] = vreinterpretq_u64_u32(
1971 vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
1972 d[5] = vreinterpretq_u64_u32(
1973 vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
1974 d[6] = vreinterpretq_u64_u32(
1975 vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
1976 d[7] = vreinterpretq_u64_u32(
1977 vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
1978#endif
1979
1980 // upper half
1981 w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
1982 vreinterpretq_u16_u8(w1.val[1]));
1983 w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
1984 vreinterpretq_u16_u8(w3.val[1]));
1985 w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
1986 vreinterpretq_u16_u8(w5.val[1]));
1987 w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
1988 vreinterpretq_u16_u8(w7.val[1]));
1989
1990 w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
1991 vreinterpretq_u32_u16(w9.val[0]));
1992 w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
1993 vreinterpretq_u32_u16(w11.val[0]));
1994 w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
1995 vreinterpretq_u32_u16(w9.val[1]));
1996 w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
1997 vreinterpretq_u32_u16(w11.val[1]));
1998
1999#if defined(__aarch64__)
2000 d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]),
2001 vreinterpretq_u64_u32(w13.val[0]));
2002 d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]),
2003 vreinterpretq_u64_u32(w13.val[0]));
2004 d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]),
2005 vreinterpretq_u64_u32(w13.val[1]));
2006 d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]),
2007 vreinterpretq_u64_u32(w13.val[1]));
2008 d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]),
2009 vreinterpretq_u64_u32(w15.val[0]));
2010 d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]),
2011 vreinterpretq_u64_u32(w15.val[0]));
2012 d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]),
2013 vreinterpretq_u64_u32(w15.val[1]));
2014 d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]),
2015 vreinterpretq_u64_u32(w15.val[1]));
2016#else
2017 d[8] = vreinterpretq_u64_u32(
2018 vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0])));
2019 d[9] = vreinterpretq_u64_u32(
2020 vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0])));
2021 d[10] = vreinterpretq_u64_u32(
2022 vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1])));
2023 d[11] = vreinterpretq_u64_u32(
2024 vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1])));
2025 d[12] = vreinterpretq_u64_u32(
2026 vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0])));
2027 d[13] = vreinterpretq_u64_u32(
2028 vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0])));
2029 d[14] = vreinterpretq_u64_u32(
2030 vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1])));
2031 d[15] = vreinterpretq_u64_u32(
2032 vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1])));
2033#endif
2034}
2035
2036static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x,
2037 uint64x2x2_t *d) {
2038 uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11;
2039 uint16x8x2_t w4, w5, w12, w13;
2040 uint32x4x2_t w6, w7, w14, w15;
2041
2042 w0 = vzipq_u8(x[0].val[0], x[1].val[0]);
2043 w1 = vzipq_u8(x[2].val[0], x[3].val[0]);
2044 w2 = vzipq_u8(x[4].val[0], x[5].val[0]);
2045 w3 = vzipq_u8(x[6].val[0], x[7].val[0]);
2046
2047 w8 = vzipq_u8(x[8].val[0], x[9].val[0]);
2048 w9 = vzipq_u8(x[10].val[0], x[11].val[0]);
2049 w10 = vzipq_u8(x[12].val[0], x[13].val[0]);
2050 w11 = vzipq_u8(x[14].val[0], x[15].val[0]);
2051
2052 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2053 vreinterpretq_u16_u8(w1.val[0]));
2054 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2055 vreinterpretq_u16_u8(w3.val[0]));
2056 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2057 vreinterpretq_u16_u8(w9.val[0]));
2058 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2059 vreinterpretq_u16_u8(w11.val[0]));
2060
2061 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2062 vreinterpretq_u32_u16(w5.val[0]));
2063 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2064 vreinterpretq_u32_u16(w5.val[1]));
2065 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2066 vreinterpretq_u32_u16(w13.val[0]));
2067 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2068 vreinterpretq_u32_u16(w13.val[1]));
2069
2070 // Store first 4-line result
2071
2072#if defined(__aarch64__)
2073 d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2074 vreinterpretq_u64_u32(w14.val[0]));
2075 d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2076 vreinterpretq_u64_u32(w14.val[0]));
2077 d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2078 vreinterpretq_u64_u32(w14.val[1]));
2079 d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2080 vreinterpretq_u64_u32(w14.val[1]));
2081 d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2082 vreinterpretq_u64_u32(w15.val[0]));
2083 d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2084 vreinterpretq_u64_u32(w15.val[0]));
2085 d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2086 vreinterpretq_u64_u32(w15.val[1]));
2087 d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2088 vreinterpretq_u64_u32(w15.val[1]));
2089#else
2090 d[0].val[0] = vreinterpretq_u64_u32(
2091 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2092 d[0].val[1] = vreinterpretq_u64_u32(
2093 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2094 d[1].val[0] = vreinterpretq_u64_u32(
2095 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2096 d[1].val[1] = vreinterpretq_u64_u32(
2097 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2098 d[2].val[0] = vreinterpretq_u64_u32(
2099 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2100 d[2].val[1] = vreinterpretq_u64_u32(
2101 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2102 d[3].val[0] = vreinterpretq_u64_u32(
2103 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2104 d[3].val[1] = vreinterpretq_u64_u32(
2105 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2106#endif
2107
2108 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2109 vreinterpretq_u16_u8(w1.val[1]));
2110 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2111 vreinterpretq_u16_u8(w3.val[1]));
2112 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2113 vreinterpretq_u16_u8(w9.val[1]));
2114 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2115 vreinterpretq_u16_u8(w11.val[1]));
2116
2117 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2118 vreinterpretq_u32_u16(w5.val[0]));
2119 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2120 vreinterpretq_u32_u16(w5.val[1]));
2121 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2122 vreinterpretq_u32_u16(w13.val[0]));
2123 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2124 vreinterpretq_u32_u16(w13.val[1]));
2125
2126 // Store second 4-line result
2127
2128#if defined(__aarch64__)
2129 d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2130 vreinterpretq_u64_u32(w14.val[0]));
2131 d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2132 vreinterpretq_u64_u32(w14.val[0]));
2133 d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2134 vreinterpretq_u64_u32(w14.val[1]));
2135 d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2136 vreinterpretq_u64_u32(w14.val[1]));
2137 d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2138 vreinterpretq_u64_u32(w15.val[0]));
2139 d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2140 vreinterpretq_u64_u32(w15.val[0]));
2141 d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2142 vreinterpretq_u64_u32(w15.val[1]));
2143 d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2144 vreinterpretq_u64_u32(w15.val[1]));
2145#else
2146 d[4].val[0] = vreinterpretq_u64_u32(
2147 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2148 d[4].val[1] = vreinterpretq_u64_u32(
2149 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2150 d[5].val[0] = vreinterpretq_u64_u32(
2151 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2152 d[5].val[1] = vreinterpretq_u64_u32(
2153 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2154 d[6].val[0] = vreinterpretq_u64_u32(
2155 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2156 d[6].val[1] = vreinterpretq_u64_u32(
2157 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2158 d[7].val[0] = vreinterpretq_u64_u32(
2159 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2160 d[7].val[1] = vreinterpretq_u64_u32(
2161 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2162#endif
2163
2164 // upper half
2165 w0 = vzipq_u8(x[0].val[1], x[1].val[1]);
2166 w1 = vzipq_u8(x[2].val[1], x[3].val[1]);
2167 w2 = vzipq_u8(x[4].val[1], x[5].val[1]);
2168 w3 = vzipq_u8(x[6].val[1], x[7].val[1]);
2169
2170 w8 = vzipq_u8(x[8].val[1], x[9].val[1]);
2171 w9 = vzipq_u8(x[10].val[1], x[11].val[1]);
2172 w10 = vzipq_u8(x[12].val[1], x[13].val[1]);
2173 w11 = vzipq_u8(x[14].val[1], x[15].val[1]);
2174
2175 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2176 vreinterpretq_u16_u8(w1.val[0]));
2177 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
2178 vreinterpretq_u16_u8(w3.val[0]));
2179 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]),
2180 vreinterpretq_u16_u8(w9.val[0]));
2181 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]),
2182 vreinterpretq_u16_u8(w11.val[0]));
2183
2184 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2185 vreinterpretq_u32_u16(w5.val[0]));
2186 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2187 vreinterpretq_u32_u16(w5.val[1]));
2188 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2189 vreinterpretq_u32_u16(w13.val[0]));
2190 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2191 vreinterpretq_u32_u16(w13.val[1]));
2192
2193 // Store first 4-line result
2194
2195#if defined(__aarch64__)
2196 d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2197 vreinterpretq_u64_u32(w14.val[0]));
2198 d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2199 vreinterpretq_u64_u32(w14.val[0]));
2200 d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2201 vreinterpretq_u64_u32(w14.val[1]));
2202 d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2203 vreinterpretq_u64_u32(w14.val[1]));
2204 d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2205 vreinterpretq_u64_u32(w15.val[0]));
2206 d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2207 vreinterpretq_u64_u32(w15.val[0]));
2208 d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2209 vreinterpretq_u64_u32(w15.val[1]));
2210 d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2211 vreinterpretq_u64_u32(w15.val[1]));
2212#else
2213 d[8].val[0] = vreinterpretq_u64_u32(
2214 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2215 d[8].val[1] = vreinterpretq_u64_u32(
2216 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2217 d[9].val[0] = vreinterpretq_u64_u32(
2218 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2219 d[9].val[1] = vreinterpretq_u64_u32(
2220 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2221 d[10].val[0] = vreinterpretq_u64_u32(
2222 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2223 d[10].val[1] = vreinterpretq_u64_u32(
2224 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2225 d[11].val[0] = vreinterpretq_u64_u32(
2226 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2227 d[11].val[1] = vreinterpretq_u64_u32(
2228 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2229#endif
2230
2231 w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2232 vreinterpretq_u16_u8(w1.val[1]));
2233 w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
2234 vreinterpretq_u16_u8(w3.val[1]));
2235 w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]),
2236 vreinterpretq_u16_u8(w9.val[1]));
2237 w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]),
2238 vreinterpretq_u16_u8(w11.val[1]));
2239
2240 w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
2241 vreinterpretq_u32_u16(w5.val[0]));
2242 w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
2243 vreinterpretq_u32_u16(w5.val[1]));
2244 w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]),
2245 vreinterpretq_u32_u16(w13.val[0]));
2246 w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]),
2247 vreinterpretq_u32_u16(w13.val[1]));
2248
2249 // Store second 4-line result
2250
2251#if defined(__aarch64__)
2252 d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]),
2253 vreinterpretq_u64_u32(w14.val[0]));
2254 d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]),
2255 vreinterpretq_u64_u32(w14.val[0]));
2256 d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]),
2257 vreinterpretq_u64_u32(w14.val[1]));
2258 d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]),
2259 vreinterpretq_u64_u32(w14.val[1]));
2260 d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]),
2261 vreinterpretq_u64_u32(w15.val[0]));
2262 d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]),
2263 vreinterpretq_u64_u32(w15.val[0]));
2264 d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]),
2265 vreinterpretq_u64_u32(w15.val[1]));
2266 d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]),
2267 vreinterpretq_u64_u32(w15.val[1]));
2268#else
2269 d[12].val[0] = vreinterpretq_u64_u32(
2270 vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0])));
2271 d[12].val[1] = vreinterpretq_u64_u32(
2272 vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0])));
2273 d[13].val[0] = vreinterpretq_u64_u32(
2274 vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1])));
2275 d[13].val[1] = vreinterpretq_u64_u32(
2276 vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1])));
2277 d[14].val[0] = vreinterpretq_u64_u32(
2278 vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0])));
2279 d[14].val[1] = vreinterpretq_u64_u32(
2280 vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0])));
2281 d[15].val[0] = vreinterpretq_u64_u32(
2282 vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1])));
2283 d[15].val[1] = vreinterpretq_u64_u32(
2284 vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1])));
2285#endif
2286}
2287
2288static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
2289 uint8_t *dst, ptrdiff_t pitchDst) {
2290 uint8x16_t r[16];
2291 uint64x2_t d[16];
2292 for (int i = 0; i < 16; i++) {
2293 r[i] = vld1q_u8(src + i * pitchSrc);
2294 }
2295 transpose16x16_neon(r, d);
2296 for (int i = 0; i < 16; i++) {
2297 vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i]));
2298 }
2299}
2300
2301static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
2302 ptrdiff_t pitchDst, int width, int height) {
2303 for (int j = 0; j < height; j += 16) {
2304 for (int i = 0; i < width; i += 16) {
2305 transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
2306 dst + j * pitchDst + i, pitchDst);
2307 }
2308 }
2309}
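// transpose() copies in 16x16 tiles, so it is only used for the block shapes
// whose width and height are both multiples of 16 (the 64x64, 32x64, 64x32
// and 16x64 paths below, which predict into a scratch buffer first).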
2310
2311static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2312 const uint8_t *left, int upsample_left,
2313 int dy) {
2314 uint8x8_t dstvec[4];
2315 uint16x4x2_t dest;
2316
2317 dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2318 transpose4x8_8x4_low_neon(dstvec, &dest);
2319 vst1_lane_u32((uint32_t *)(dst + stride * 0),
2320 vreinterpret_u32_u16(dest.val[0]), 0);
2321 vst1_lane_u32((uint32_t *)(dst + stride * 1),
2322 vreinterpret_u32_u16(dest.val[0]), 1);
2323 vst1_lane_u32((uint32_t *)(dst + stride * 2),
2324 vreinterpret_u32_u16(dest.val[1]), 0);
2325 vst1_lane_u32((uint32_t *)(dst + stride * 3),
2326 vreinterpret_u32_u16(dest.val[1]), 1);
2327}
2328
2329static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2330 const uint8_t *left, int upsample_left,
2331 int dy) {
2332 uint8x8_t dstvec[8];
2333 uint32x2x2_t d[4];
2334
2335 dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2336 transpose8x8_neon(dstvec, d);
2337 vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2338 vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2339 vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2340 vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2341 vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]);
2342 vst1_u32((uint32_t *)(dst + 5 * stride), d[2].val[1]);
2343 vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]);
2344 vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]);
2345}
2346
2347static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2348 const uint8_t *left, int upsample_left,
2349 int dy) {
2350 uint8x8_t dstvec[4];
2351 uint16x4x2_t d[2];
2352
2353 dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2354 transpose4x8_8x4_neon(dstvec, d);
2355 vst1_lane_u32((uint32_t *)(dst + stride * 0),
2356 vreinterpret_u32_u16(d[0].val[0]), 0);
2357 vst1_lane_u32((uint32_t *)(dst + stride * 1),
2358 vreinterpret_u32_u16(d[0].val[0]), 1);
2359 vst1_lane_u32((uint32_t *)(dst + stride * 2),
2360 vreinterpret_u32_u16(d[0].val[1]), 0);
2361 vst1_lane_u32((uint32_t *)(dst + stride * 3),
2362 vreinterpret_u32_u16(d[0].val[1]), 1);
2363 vst1_lane_u32((uint32_t *)(dst + stride * 4),
2364 vreinterpret_u32_u16(d[1].val[0]), 0);
2365 vst1_lane_u32((uint32_t *)(dst + stride * 5),
2366 vreinterpret_u32_u16(d[1].val[0]), 1);
2367 vst1_lane_u32((uint32_t *)(dst + stride * 6),
2368 vreinterpret_u32_u16(d[1].val[1]), 0);
2369 vst1_lane_u32((uint32_t *)(dst + stride * 7),
2370 vreinterpret_u32_u16(d[1].val[1]), 1);
2371}
2372
2373static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2374 const uint8_t *left, int upsample_left,
2375 int dy) {
2376 uint8x8_t dstvec[8];
2377 uint32x2x2_t d[2];
2378
2379 dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2380 transpose8x8_low_neon(dstvec, d);
2381 vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]);
2382 vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]);
2383 vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]);
2384 vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]);
2385}
2386
2387static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2388 const uint8_t *left, int upsample_left,
2389 int dy) {
2390 uint8x16_t dstvec[8];
2391 uint64x2_t d[8];
2392
2393 dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2394 transpose8x16_16x8_neon(dstvec, d);
2395 for (int i = 0; i < 8; i++) {
2396 vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i])));
2397 vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i])));
2398 }
2399}
2400
2401static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2402 const uint8_t *left, int upsample_left,
2403 int dy) {
2404 uint8x8_t dstvec[16];
2405 uint64x2_t d[8];
2406
2407 dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2408 transpose16x8_8x16_neon(dstvec, d);
2409 for (int i = 0; i < 8; i++) {
2410 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2411 }
2412}
2413
2414static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2415 const uint8_t *left, int upsample_left,
2416 int dy) {
2417 uint8x16_t dstvec[4];
2418 uint16x8x2_t d[2];
2419
2420 dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2421 transpose4x16_neon(dstvec, d);
2422 vst1q_lane_u32((uint32_t *)(dst + stride * 0),
2423 vreinterpretq_u32_u16(d[0].val[0]), 0);
2424 vst1q_lane_u32((uint32_t *)(dst + stride * 1),
2425 vreinterpretq_u32_u16(d[0].val[0]), 1);
2426 vst1q_lane_u32((uint32_t *)(dst + stride * 2),
2427 vreinterpretq_u32_u16(d[0].val[0]), 2);
2428 vst1q_lane_u32((uint32_t *)(dst + stride * 3),
2429 vreinterpretq_u32_u16(d[0].val[0]), 3);
2430
2431 vst1q_lane_u32((uint32_t *)(dst + stride * 4),
2432 vreinterpretq_u32_u16(d[0].val[1]), 0);
2433 vst1q_lane_u32((uint32_t *)(dst + stride * 5),
2434 vreinterpretq_u32_u16(d[0].val[1]), 1);
2435 vst1q_lane_u32((uint32_t *)(dst + stride * 6),
2436 vreinterpretq_u32_u16(d[0].val[1]), 2);
2437 vst1q_lane_u32((uint32_t *)(dst + stride * 7),
2438 vreinterpretq_u32_u16(d[0].val[1]), 3);
2439
2440 vst1q_lane_u32((uint32_t *)(dst + stride * 8),
2441 vreinterpretq_u32_u16(d[1].val[0]), 0);
2442 vst1q_lane_u32((uint32_t *)(dst + stride * 9),
2443 vreinterpretq_u32_u16(d[1].val[0]), 1);
2444 vst1q_lane_u32((uint32_t *)(dst + stride * 10),
2445 vreinterpretq_u32_u16(d[1].val[0]), 2);
2446 vst1q_lane_u32((uint32_t *)(dst + stride * 11),
2447 vreinterpretq_u32_u16(d[1].val[0]), 3);
2448
2449 vst1q_lane_u32((uint32_t *)(dst + stride * 12),
2450 vreinterpretq_u32_u16(d[1].val[1]), 0);
2451 vst1q_lane_u32((uint32_t *)(dst + stride * 13),
2452 vreinterpretq_u32_u16(d[1].val[1]), 1);
2453 vst1q_lane_u32((uint32_t *)(dst + stride * 14),
2454 vreinterpretq_u32_u16(d[1].val[1]), 2);
2455 vst1q_lane_u32((uint32_t *)(dst + stride * 15),
2456 vreinterpretq_u32_u16(d[1].val[1]), 3);
2457}
2458
2459static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2460 const uint8_t *left, int upsample_left,
2461 int dy) {
2462 uint8x8_t dstvec[16];
2463 uint64x2_t d[8];
2464
2465 dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2466 transpose16x8_8x16_neon(dstvec, d);
2467 for (int i = 0; i < 4; i++) {
2468 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2469 }
2470}
2471
2472static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2473 const uint8_t *left, int upsample_left,
2474 int dy) {
2475 uint8x16x2_t dstvec[16];
2476 uint64x2x2_t d[16];
2477 uint8x16_t v_zero = vdupq_n_u8(0);
2478
2479 dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy);
2480 for (int i = 8; i < 16; i++) {
2481 dstvec[i].val[0] = v_zero;
2482 dstvec[i].val[1] = v_zero;
2483 }
2484 transpose16x32_neon(dstvec, d);
2485 for (int i = 0; i < 16; i++) {
2486 vst1_u8(dst + 2 * i * stride,
2487 vreinterpret_u8_u64(vget_low_u64(d[i].val[0])));
2488 vst1_u8(dst + (2 * i + 1) * stride,
2489 vreinterpret_u8_u64(vget_low_u64(d[i].val[1])));
2490 }
2491}
2492
2493static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2494 const uint8_t *left, int upsample_left,
2495 int dy) {
2496 uint8x8_t dstvec[32];
2497 uint64x2_t d[16];
2498
2499 dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2500 transpose16x8_8x16_neon(dstvec, d);
2501 transpose16x8_8x16_neon(dstvec + 16, d + 8);
2502 for (int i = 0; i < 8; i++) {
2503 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2504 vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8]));
2505 }
2506}
2507
2508static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2509 const uint8_t *left, int upsample_left,
2510 int dy) {
2511 uint8x16_t dstvec[16];
2512 uint64x2_t d[16];
2513
2514 dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2515 transpose16x16_neon(dstvec, d);
2516 for (int i = 0; i < 16; i++) {
2517 vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i]));
2518 }
2519}
2520
2521static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2522 const uint8_t *left, int upsample_left,
2523 int dy) {
2524 uint8x16x2_t dstvec[32];
2525 uint64x2x2_t d[32];
2526
2527 dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy);
2528 transpose16x32_neon(dstvec, d);
2529 transpose16x32_neon(dstvec + 16, d + 16);
2530 for (int i = 0; i < 16; i++) {
2531 vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2532 vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0]));
2533 vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2534 vst1q_u8(dst + (2 * i + 1) * stride + 16,
2535 vreinterpretq_u8_u64(d[i + 16].val[1]));
2536 }
2537}
2538
2539static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2540 const uint8_t *left, int upsample_left,
2541 int dy) {
2542 DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2543
2544 dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy);
2545 transpose(dstT, 64, dst, stride, 64, 64);
2546}
2547
2548static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2549 const uint8_t *left, int upsample_left,
2550 int dy) {
2551 uint8x16x2_t dstvec[16];
2552 uint64x2x2_t d[16];
2553
2554 dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy);
2555 transpose16x32_neon(dstvec, d);
2556 for (int i = 0; i < 16; i++) {
2557 vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0]));
2558 vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1]));
2559 }
2560}
2561
2562static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2563 const uint8_t *left, int upsample_left,
2564 int dy) {
2565 uint8x16_t dstvec[32];
2566 uint64x2_t d[16];
2567
2568 dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2569 for (int i = 0; i < 32; i += 16) {
2570 transpose16x16_neon((dstvec + i), d);
2571 for (int j = 0; j < 16; j++) {
2572 vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2573 }
2574 }
2575}
2576
2577static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2578 const uint8_t *left, int upsample_left,
2579 int dy) {
2580 uint8_t dstT[64 * 32];
2581
2582 dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy);
2583 transpose(dstT, 64, dst, stride, 32, 64);
2584}
2585
2586static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2587 const uint8_t *left, int upsample_left,
2588 int dy) {
2589 uint8_t dstT[32 * 64];
2590
2591 dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy);
2592 transpose(dstT, 32, dst, stride, 64, 32);
2593}
2594
2595static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2596 const uint8_t *left, int upsample_left,
2597 int dy) {
2598 uint8_t dstT[64 * 16];
2599
2600 dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy);
2601 transpose(dstT, 64, dst, stride, 16, 64);
2602}
2603
2604static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2605 const uint8_t *left, int upsample_left,
2606 int dy) {
2607 uint8x16_t dstvec[64];
2608 uint64x2_t d[16];
2609
2610 dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2611 for (int i = 0; i < 64; i += 16) {
2612 transpose16x16_neon((dstvec + i), d);
2613 for (int j = 0; j < 16; j++) {
2614 vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j]));
2615 }
2616 }
2617}
2618
2619void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2620 const uint8_t *above, const uint8_t *left,
2621 int upsample_left, int dx, int dy) {
2622 (void)above;
2623 (void)dx;
2624 assert(dx == 1);
2625 assert(dy > 0);
2626
2627 if (bw == bh) {
2628 switch (bw) {
2629 case 4:
2630 dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy);
2631 break;
2632 case 8:
2633 dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy);
2634 break;
2635 case 16:
2636 dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy);
2637 break;
2638 case 32:
2639 dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy);
2640 break;
2641 case 64:
2642 dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy);
2643 break;
2644 }
2645 } else {
2646 if (bw < bh) {
2647 if (bw + bw == bh) {
2648 switch (bw) {
2649 case 4:
2650 dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy);
2651 break;
2652 case 8:
2653 dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy);
2654 break;
2655 case 16:
2656 dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy);
2657 break;
2658 case 32:
2659 dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy);
2660 break;
2661 }
2662 } else {
2663 switch (bw) {
2664 case 4:
2665 dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy);
2666 break;
2667 case 8:
2668 dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy);
2669 break;
2670 case 16:
2671 dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy);
2672 break;
2673 }
2674 }
2675 } else {
2676 if (bh + bh == bw) {
2677 switch (bh) {
2678 case 4:
2679 dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy);
2680 break;
2681 case 8:
2682 dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy);
2683 break;
2684 case 16:
2685 dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy);
2686 break;
2687 case 32:
2688 dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy);
2689 break;
2690 }
2691 } else {
2692 switch (bh) {
2693 case 4:
2694 dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy);
2695 break;
2696 case 8:
2697 dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy);
2698 break;
2699 case 16:
2700 dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy);
2701 break;
2702 }
2703 }
2704 }
2705 }
2706}
2707static const int sm_weight_log2_scale = 8;
2708
2709// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
2710#define MAX_BLOCK_DIM 64
2711
2712/* clang-format off */
2713static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
2714 // Unused, because we always offset by bs, which is at least 2.
2715 0, 0,
2716 // bs = 2
2717 255, 128,
2718 // bs = 4
2719 255, 149, 85, 64,
2720 // bs = 8
2721 255, 197, 146, 105, 73, 50, 37, 32,
2722 // bs = 16
2723 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
2724 // bs = 32
2725 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
2726 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
2727 // bs = 64
2728 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
2729 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
2730 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
2731 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
2732};
2733/* clang-format on */
2734
2735// -----------------------------------------------------------------------------
2736// SMOOTH_PRED
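// The predictors below compute, per pixel, the fixed-point blend
//   pred(r, c) = (w[r] * above[c] + (256 - w[r]) * left[H - 1] +
//                 w[c] * left[r]  + (256 - w[c]) * above[W - 1] + 256) >> 9
// where w[] comes from sm_weight_arrays at the offset for the block dimension
// (scale = 1 << sm_weight_log2_scale = 256; the rounding shift is 9 because
// the vertical and horizontal interpolations are summed before averaging).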
2737
2738// pixels[0]: above and below_pred interleave vector
2739// pixels[1]: left vector
2740// pixels[2]: right_pred vector
2741static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
2742 int height, uint8x16_t *pixels) {
2743 uint32x4_t zero = vdupq_n_u32(0);
2744 const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
2745 if (height == 4)
2746 pixels[1] =
2747 vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
2748 else if (height == 8) {
2749 pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
2750 ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
2751 } else {
2752 pixels[1] = vld1q_u8(left);
2753 }
2754
2755 pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
2756
2757 const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
2758#if defined(__aarch64__)
2759 pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
2760#else
2761 pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
2762#endif // (__aarch64__)
2763}
2764
2765// weight_h[0]: weight_h vector
2766// weight_h[1]: scale - weight_h vector
2767// weight_h[2]: same as [0], second half for height = 16 only
2768// weight_h[3]: same as [1], second half for height = 16 only
2769// weight_w[0]: weights_w and scale - weights_w interleave vector
2770static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
2771 uint16x8_t *weight_h, uint16x8_t *weight_w) {
2772 const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
2773 const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
2774 weight_h[0] = vmovl_u8(t);
2775 weight_h[1] = vsubw_u8(d, t);
2776#if defined(__aarch64__)
2777 weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
2778#else
2779 weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
2780#endif // (__aarch64__)
2781
2782 if (height == 8) {
2783 const uint8x8_t weight = vld1_u8(&weight_array[8]);
2784 weight_h[0] = vmovl_u8(weight);
2785 weight_h[1] = vsubw_u8(d, weight);
2786 } else if (height == 16) {
2787 const uint8x16_t zero = vdupq_n_u8(0);
2788 const uint8x16_t weight = vld1q_u8(&weight_array[16]);
2789 const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
2790 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2791 weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
2792 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2793 weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
2794 }
2795}
2796
2797static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
2798 const uint16x8_t *wh, const uint16x8_t *ww,
2799 int h, uint8_t *dst, ptrdiff_t stride,
2800 int second_half) {
2801 const uint16x4_t one = vdup_n_u16(1);
2802 const uint16x4_t inc = vdup_n_u16(0x202);
2803 uint16x4_t rep =
2804 second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
2805 uint16x4_t d = vdup_n_u16(0x100);
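  // d and rep are byte-index vectors for the table lookups below: d starts as
  // bytes {0, 1} in every lane and advances by {2, 2} per row, selecting the
  // r-th uint16 weight from wh[]; rep starts as {0, 0x80} (or {8, 0x80} for
  // the second half) and advances by 1, so the lookup pulls left[r] into the
  // low byte of each lane (indices >= 16 return 0 in vtbl/vqtbl).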
2806 const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
2807 const uint16x4_t v_pixel_0_hi =
2808 vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
2809 const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
2810 const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
2811 const uint16x4_t ww_0_hi =
2812 vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
2813 const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
2814
2815#if !defined(__aarch64__)
2816 const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
2817 vget_high_u8(
2818 vreinterpretq_u8_u16(wh[0])) } };
2819 const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
2820 vget_high_u8(
2821 vreinterpretq_u8_u16(wh[1])) } };
2822 const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
2823 vget_high_u8(pixel[1]) } };
2824#endif // (__aarch64__)
2825
2826 for (int i = 0; i < h; ++i) {
2827#if defined(__aarch64__)
2828 const uint8x8_t wg =
2829 vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
2830 const uint8x8_t sc =
2831 vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
2832#else
2833 const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
2834 const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
2835#endif // (__aarch64__)
2836
2837 uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
2838 sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
2839
2840#if defined(__aarch64__)
2841 uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
2842#else
2843 uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
2844#endif // (__aarch64__)
2845
2846 sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
2847 sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
2848 uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
2849 uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
2850 vst1_lane_u32((uint32_t *)dst, predsh, 0);
2851
2852 dst += stride;
2853
2854 rep = vadd_u16(rep, one);
2855 d = vadd_u16(d, inc);
2856 }
2857}
2858
2859void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2860 const uint8_t *above, const uint8_t *left) {
2861 uint8x16_t pixels[3];
2862 load_pixel_w4(above, left, 4, pixels);
2863
2864 uint16x8_t wh[4], ww[2];
2865 load_weight_w4(sm_weight_arrays, 4, wh, ww);
2866
2867 smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
2868}
2869
2870void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2871 const uint8_t *above, const uint8_t *left) {
2872 uint8x16_t pixels[3];
2873 load_pixel_w4(above, left, 8, pixels);
2874
2875 uint16x8_t wh[4], ww[2];
2876 load_weight_w4(sm_weight_arrays, 8, wh, ww);
2877
2878 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
2879}
2880
2881void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2882 const uint8_t *above, const uint8_t *left) {
2883 uint8x16_t pixels[3];
2884 load_pixel_w4(above, left, 16, pixels);
2885
2886 uint16x8_t wh[4], ww[2];
2887 load_weight_w4(sm_weight_arrays, 16, wh, ww);
2888
2889 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
2890 dst += stride << 3;
2891 smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
2892}
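// 4x16 is done in two 8-row passes: wh[0..1] hold the weights for rows 0-7
// and wh[2..3] those for rows 8-15, while second_half makes smooth_pred_4xh
// start its left-pixel lookup at left[8].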
2893
2894// pixels[0]: above and below_pred interleave vector, first half
2895// pixels[1]: above and below_pred interleave vector, second half
2896// pixels[2]: left vector
2897// pixels[3]: right_pred vector
2898// pixels[4]: above and below_pred interleave vector, first half
2899// pixels[5]: above and below_pred interleave vector, second half
2900// pixels[6]: left vector + 16
2901// pixels[7]: right_pred vector
2902static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
2903 int height, uint8x16_t *pixels) {
2904 pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
2905 pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
2906 pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
2907
2908 if (height == 4) {
2909 const uint32x4_t zero32 = vdupq_n_u32(0);
2910 pixels[2] =
2911 vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
2912 } else if (height == 8) {
2913 const uint64x2_t zero64 = vdupq_n_u64(0);
2914 pixels[2] = vreinterpretq_u8_u64(
2915 vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
2916 } else if (height == 16) {
2917 pixels[2] = vld1q_u8(left);
2918 } else {
2919 pixels[2] = vld1q_u8(left);
2920 pixels[4] = pixels[0];
2921 pixels[5] = pixels[1];
2922 pixels[6] = vld1q_u8(left + 16);
2923 pixels[7] = pixels[3];
2924 }
2925}
2926
// weight_h[0]: height weights, zero-extended to 16 bits
// weight_h[1]: scale (256) - weight_h[0]
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: width (8) weights, zero-extended to 16 bits
// weight_w[1]: scale (256) - weight_w[0]
2937static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
2938 uint16x8_t *weight_h, uint16x8_t *weight_w) {
2939 const uint8x16_t zero = vdupq_n_u8(0);
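  // Weights for an edge of length N start at offset N in sm_weight_arrays, so
  // the height-4 weights sit at offset 4 and the width-8 (and height-8)
  // weights at offset 8.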
2940 const int we_offset = height < 8 ? 4 : 8;
2941 uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
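  // Interleaving the 8-bit weights with zero zero-extends them to 16 bits.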
2942#if defined(__aarch64__)
2943 weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
2944#else
2945 weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
2946#endif // (__aarch64__)
2947 const uint16x8_t d = vdupq_n_u16(256);
2948 weight_h[1] = vsubq_u16(d, weight_h[0]);
2949
2950 if (height == 4) {
2951 we = vextq_u8(we, zero, 4);
2952#if defined(__aarch64__)
2953 weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
2954#else
2955 weight_w[0] = vmovl_u8(vget_low_u8(we));
2956#endif // (__aarch64__)
2957 weight_w[1] = vsubq_u16(d, weight_w[0]);
2958 } else {
2959 weight_w[0] = weight_h[0];
2960 weight_w[1] = weight_h[1];
2961 }
2962
2963 if (height == 16) {
2964 we = vld1q_u8(&weight_array[16]);
2965 const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
2966 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2967 weight_h[1] = vsubq_u16(d, weight_h[0]);
2968 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2969 weight_h[3] = vsubq_u16(d, weight_h[2]);
2970 } else if (height == 32) {
2971 const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
2972 const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
2973 weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
2974 weight_h[1] = vsubq_u16(d, weight_h[0]);
2975 weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
2976 weight_h[3] = vsubq_u16(d, weight_h[2]);
2977 const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
2978 const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
2979 weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
2980 weight_h[5] = vsubq_u16(d, weight_h[4]);
2981 weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
2982 weight_h[7] = vsubq_u16(d, weight_h[6]);
2983 }
2984}
2985
2986static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
2987 const uint16x8_t *wh, const uint16x8_t *ww,
2988 int h, uint8_t *dst, ptrdiff_t stride,
2989 int second_half) {
2990 const uint16x8_t one = vdupq_n_u16(1);
2991 const uint16x8_t inc = vdupq_n_u16(0x202);
2992 uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
2993 : vdupq_n_u16((uint16_t)0x8000);
2994 uint16x8_t d = vdupq_n_u16(0x100);
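  // d holds byte-pair indices {0x00, 0x01} that gather row i's 16-bit height
  // weight into every lane; inc (0x0202) steps to the next weight each row.
  // rep does the same for the left pixel: its low byte indexes left[i] and its
  // high byte (0x80) is out of range, so the lookup returns zero and the pixel
  // comes back zero-extended to 16 bits.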
2995
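  // AArch32 vtbl2_u8 takes its 16-byte table as a pair of 8-byte halves, so
  // split the q registers up front.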
2996#if !defined(__aarch64__)
2997 const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
2998 vget_high_u8(
2999 vreinterpretq_u8_u16(wh[0])) } };
3000 const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
3001 vget_high_u8(
3002 vreinterpretq_u8_u16(wh[1])) } };
3003 const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
3004 vget_high_u8(pixels[2]) } };
3005#endif
3006
3007 for (int i = 0; i < h; ++i) {
3008#if defined(__aarch64__)
3009 const uint8x16_t wg_wg =
3010 vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
3011 const uint8x16_t sc_sc =
3012 vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
3013#else
3014 const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
3015 const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
3016 const uint8x16_t wg_wg =
3017 vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
3018 const uint8x16_t sc_sc =
3019 vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
3020#endif // (__aarch64__)
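    // Vertical component: w_h[i] * above + (256 - w_h[i]) * below_pred.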
3021 uint16x8_t s01 =
3022 vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
3023 s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
3024 vreinterpretq_u16_u8(sc_sc));
3025#if defined(__aarch64__)
3026 const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
3027#else
3028 const uint8x16_t b = vcombine_u8(
3029 vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
3030 vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
3031#endif // (__aarch64__)
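    // Horizontal component: w_w * left[i] + (256 - w_w) * right_pred.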
3032 uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
3033 sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
3034
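    // Each component can reach 255 * 256, so widen to 32 bits before adding
    // them; their sum does not fit in 16 bits.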
3035 uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
3036#if defined(__aarch64__)
3037 uint32x4_t s1 = vaddl_high_u16(s01, sum0);
3038#else
3039 uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
3040#endif // (__aarch64__)
3041
3042 sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
3043 uint8x8_t predsh = vqmovn_u16(sum0);
3044 vst1_u8(dst, predsh);
3045
3046 dst += stride;
3047 rep = vaddq_u16(rep, one);
3048 d = vaddq_u16(d, inc);
3049 }
3050}
3051
3052void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
3053 const uint8_t *above, const uint8_t *left) {
3054 uint8x16_t pixels[4];
3055 load_pixel_w8(above, left, 4, pixels);
3056
3057 uint16x8_t wh[4], ww[2];
3058 load_weight_w8(sm_weight_arrays, 4, wh, ww);
3059
3060 smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
3061}
3062
3063void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
3064 const uint8_t *above, const uint8_t *left) {
3065 uint8x16_t pixels[4];
3066 load_pixel_w8(above, left, 8, pixels);
3067
3068 uint16x8_t wh[4], ww[2];
3069 load_weight_w8(sm_weight_arrays, 8, wh, ww);
3070
3071 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
3072}
3073
3074void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
3075 const uint8_t *above, const uint8_t *left) {
3076 uint8x16_t pixels[4];
3077 load_pixel_w8(above, left, 16, pixels);
3078
3079 uint16x8_t wh[4], ww[2];
3080 load_weight_w8(sm_weight_arrays, 16, wh, ww);
3081
3082 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
3083 dst += stride << 3;
3084 smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
3085}
3086
3087void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
3088 const uint8_t *above, const uint8_t *left) {
3089 uint8x16_t pixels[8];
3090 load_pixel_w8(above, left, 32, pixels);
3091
3092 uint16x8_t wh[8], ww[2];
3093 load_weight_w8(sm_weight_arrays, 32, wh, ww);
3094
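  // 32 rows are produced as four 8-row passes; pixels[4..7] and wh[4..7]
  // cover rows 16..31.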
3095 smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
3096 dst += stride << 3;
3097 smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
3098 dst += stride << 3;
3099 smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
3100 dst += stride << 3;
3101 smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
3102}
3103
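// Generic path for the 16-, 32- and 64-wide blocks: evaluates the SMOOTH
// formula directly, eight output pixels per iteration, keeping the
// intermediate sums in 32 bits.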
3104static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
3105 const uint8_t *above,
3106 const uint8_t *left, uint32_t bw,
3107 uint32_t bh) {
3108 const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
3109 const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
3110 const uint16x8_t scale_value = vdupq_n_u16(256);
3111
3112 for (uint32_t y = 0; y < bh; ++y) {
3113 const uint8x8_t left_y = vdup_n_u8(left[y]);
3114 const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
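    // Row constant: the below_pred term plus 256, the rounding bias for the
    // final truncating shift by 9.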
3115 const uint32x4_t pred_scaled_bl =
3116 vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
3117
3118 for (uint32_t x = 0; x < bw; x += 8) {
3119 const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
3120 const uint8x8_t top_x = vld1_u8(above + x);
3121
3122 uint16x8_t pred_m1, pred_m2;
3123 uint32x4_t pred_lo, pred_hi;
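      // pred_m1: vertical term w_h[y] * above[x];
      // pred_m2: horizontal term w_w[x] * left[y].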
3124 pred_m1 = vmull_u8(top_x, weights_y_dup);
3125 pred_m2 = vmull_u8(weights_x, left_y);
3126
3127 pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
3128#if defined(__aarch64__)
3129 pred_hi = vaddl_high_u16(pred_m1, pred_m2);
3130#else
3131 pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
3132#endif // (__aarch64__)
3133
3134 const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
3135
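      // right_pred term: (256 - w_w[x]) * above[bw - 1].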
3136 const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
3137
3138 pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
3139 pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
3140
3141 pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
3142#if defined(__aarch64__)
3143 pred_hi = vaddw_high_u16(pred_hi, swxtr);
3144#else
3145 pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
3146#endif // (__aarch64__)
3147
3148 uint16x8_t pred =
3149 vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
3150
3151 uint8x8_t predsh = vqmovn_u16(pred);
3152
3153 vst1_u8(dst + x, predsh);
3154 }
3155
3156 dst += stride;
3157 }
3158}
3159
3160void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
3161 const uint8_t *above, const uint8_t *left) {
3162 smooth_predictor_wxh(dst, stride, above, left, 16, 4);
3163}
3164
3165void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
3166 const uint8_t *above, const uint8_t *left) {
3167 smooth_predictor_wxh(dst, stride, above, left, 16, 8);
3168}
3169
3170void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
3171 const uint8_t *above,
3172 const uint8_t *left) {
3173 smooth_predictor_wxh(dst, stride, above, left, 16, 16);
3174}
3175
3176void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
3177 const uint8_t *above,
3178 const uint8_t *left) {
3179 smooth_predictor_wxh(dst, stride, above, left, 16, 32);
3180}
3181
3182void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
3183 const uint8_t *above, const uint8_t *left) {
3184 smooth_predictor_wxh(dst, stride, above, left, 32, 8);
3185}
3186
3187void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
3188 const uint8_t *above,
3189 const uint8_t *left) {
3190 smooth_predictor_wxh(dst, stride, above, left, 32, 16);
3191}
3192
3193void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
3194 const uint8_t *above,
3195 const uint8_t *left) {
3196 smooth_predictor_wxh(dst, stride, above, left, 32, 32);
3197}
3198
3199void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
3200 const uint8_t *above,
3201 const uint8_t *left) {
3202 smooth_predictor_wxh(dst, stride, above, left, 32, 64);
3203}
3204
3205void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
3206 const uint8_t *above,
3207 const uint8_t *left) {
3208 smooth_predictor_wxh(dst, stride, above, left, 64, 64);
3209}
3210
3211void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
3212 const uint8_t *above,
3213 const uint8_t *left) {
3214 smooth_predictor_wxh(dst, stride, above, left, 64, 32);
3215}
3216
3217void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
3218 const uint8_t *above,
3219 const uint8_t *left) {
3220 smooth_predictor_wxh(dst, stride, above, left, 64, 16);
3221}
3222
3223void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
3224 const uint8_t *above,
3225 const uint8_t *left) {
3226 smooth_predictor_wxh(dst, stride, above, left, 16, 64);
3227}