/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_ssse3.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"

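// Shuffle-control masks for the 4-tap horizontal kernels below. Each 32-byte
// entry gathers pairs of adjacent source bytes starting at offset 0, 2, 4 and
// 6 respectively, and consists of two identical 16-byte halves (presumably so
// the table can also be shared with 256-bit kernels; only the low 128 bits
// are loaded here). This file only uses the entries at offsets 32 and 64.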
DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
  2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
  5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
  7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
  12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
  7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

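// Shuffle-control mask for aom_filter_block1d4_h4_ssse3(): the n-th 32-bit
// lane gathers the four consecutive source bytes at offsets n + 2 .. n + 5,
// i.e. the bytes that line up with the middle four taps of the 8-tap filter
// for the n-th output pixel.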
DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};

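// The *_h4/*_v4 kernels below apply only the middle four taps (indices 2..5)
// of the 8-tap kernel, so they implicitly assume the outer taps are zero.
// The 16-bit taps are shifted right by 1 before being packed to signed bytes,
// presumably so the _mm_maddubs_epi16() products cannot saturate; the lost
// bit is compensated for by rounding with 32 and shifting by 6 rather than
// the usual FILTER_BITS (7).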
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load 16 bytes of the source row
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // add the rounding offset and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16 bit value to 8 bits; only the low 4 bytes are kept
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32;
  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
      srcReg6, srcReg56;
  __m128i srcReg23_34_lo, srcReg45_56_lo;
  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
  __m128i resReglo, resReghi;
  __m128i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);

  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);

    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);

    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());

    // add the rounding offset and shift each 16 bit value right by 6 bits
    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm_srai_epi16(resReglo, 6);
    resReghi = _mm_srai_epi16(resReghi, 6);

    // shrink each 16 bit value to 8 bits; the low lane of each register
    // contains one row of convolve results
    resReglo = _mm_packus_epi16(resReglo, resReglo);
    resReghi = _mm_packus_epi16(resReghi, resReghi);

    src_ptr += src_stride;

    *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4 = srcReg6;
  }
}

static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16 bit value to 8 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d8_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
  __m128i resReg23, resReg34, resReg45, resReg56;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);

    // add and saturate the results together
    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);

    // shrink each 16 bit value to 8 bits; the low lane of each register
    // contains one row of convolve results
    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());

    src_ptr += src_stride;

    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg23 = srcReg45;
    srcReg34 = srcReg56;
    srcReg4 = srcReg6;
  }
}

static void aom_filter_block1d16_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1, srcReg32b2;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // load the next 16 bytes, 8 bytes past the first load
    // (part of the data overlaps the earlier read)
    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink each 16 bit value to 8 bits; the low lane contains the first
    // eight convolve results and the high lane contains the second eight
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_pixels_per_line;

    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d16_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);

    // add and saturate the results together
    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);

    // shrink each 16 bit value to 8 bits; each register holds one full row
    // of convolve results
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);

    src_ptr += src_stride;

    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg23_lo = srcReg45_lo;
    srcReg34_lo = srcReg56_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}

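// The helpers below implement aom_scaled_2d_ssse3(). Unlike the kernels
// above, which apply one filter to the whole block, they apply the full
// 8-tap filter at a per-pixel fractional position: the horizontal pass
// transposes 8x8 (or 4x4) tiles so that convolve8_8_ssse3() can be reused
// for both directions, and whole-pixel positions fall back to a plain copy.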
static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
    const __m128i *const s, const int16_t *const filter) {
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);
  return convolve8_8_ssse3(s, f);
}

static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const x_filter) {
  __m128i s[8], ss[4], temp;

  load_8bit_8x8(src, src_stride, s);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
  // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
  // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
  transpose_16bit_4x8(s, ss);
  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
  // shrink each 16 bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void transpose8x8_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i s[8];

  load_8bit_8x8(src, src_stride, s);
  transpose_8bit_8x8(s, s);
  store_8bit_8x8(s, dst, dst_stride);
}

static void scaledconvolve_horiz_w8(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          // whole-pixel position: copy the center pixel; the +3 undoes the
          // src -= SUBPEL_TAPS / 2 - 1 adjustment above
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static void filter_horiz_w4_ssse3(const uint8_t *const src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter) {
  __m128i s[4];
  __m128i temp;

  load_8bit_8x4(src, src_stride, s);
  transpose_16bit_4x4(s, s);

  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
  // shrink each 16 bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void transpose4x4_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i s[4];

  load_8bit_4x4(src, src_stride, s);
  s[0] = transpose_8bit_4x4(s);
  s[1] = _mm_srli_si128(s[0], 4);
  s[2] = _mm_srli_si128(s[0], 8);
  s[3] = _mm_srli_si128(s[0], 12);
  store_8bit_4x4(s, dst, dst_stride);
}

static void scaledconvolve_horiz_w4(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

static __m128i filter_vert_kernel(const __m128i *const s,
                                  const int16_t *const filter) {
  __m128i ss[4];
  __m128i temp;

  // 00 10 01 11 02 12 03 13
  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
  // 20 30 21 31 22 32 23 33
  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
  // 40 50 41 51 42 52 43 53
  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
  // 60 70 61 71 62 72 63 73
  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);

  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
  // shrink each 16 bit value to 8 bits
  return _mm_packus_epi16(temp, temp);
}

static void filter_vert_w4_ssse3(const uint8_t *const src,
                                 const ptrdiff_t src_stride, uint8_t *const dst,
                                 const int16_t *const filter) {
  __m128i s[8];
  __m128i temp;

  load_8bit_4x8(src, src_stride, s);
  temp = filter_vert_kernel(s, filter);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

static void filter_vert_w8_ssse3(const uint8_t *const src,
                                 const ptrdiff_t src_stride, uint8_t *const dst,
                                 const int16_t *const filter) {
  __m128i s[8], temp;

  load_8bit_8x8(src, src_stride, s);
  temp = filter_vert_kernel(s, filter);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void scaledconvolve_vert_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void filter_vert_w16_ssse3(const uint8_t *src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter, const int w) {
  int i;
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);

  for (i = 0; i < w; i += 16) {
    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;

    loadu_8bit_16x8(src, src_stride, s);

    // interleave the rows of source pixels
    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
    temp_lo = convolve8_8_ssse3(s_lo, f);
    temp_hi = convolve8_8_ssse3(s_hi, f);

    // shrink each 16 bit value to 8 bits; the low lane contains the first
    // eight convolve results and the high lane contains the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}

static void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  // (1) Interpolate horizontally into an intermediate buffer, temp.
  // (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  // original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
  // is still big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}

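// The 8-tap (*_v8/*_h8) and bilinear (*_v2/*_h2) kernels below are declared
// here and defined elsewhere (likely in the SSSE3 assembly sources).
// FUN_CONV_1D builds aom_convolve8_horiz_ssse3() and aom_convolve8_vert_ssse3()
// on top of these kernels and the intrinsics above.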
filter8_1dfunction aom_filter_block1d16_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_h8_ssse3;
filter8_1dfunction aom_filter_block1d8_v8_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;

filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;

// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)