blob: 47afd856bc840d4e4d89695c9d37f4a30a0ee5e7 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
Yaowu Xuf883b422016-08-30 14:01:10 -070012#include "./aom_config.h"
13#include "./aom_dsp_rtcd.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070014#include "aom_dsp/x86/convolve.h"
15
16#if HAVE_SSE2
Yaowu Xuf883b422016-08-30 14:01:10 -070017filter8_1dfunction aom_filter_block1d16_v8_sse2;
18filter8_1dfunction aom_filter_block1d16_h8_sse2;
19filter8_1dfunction aom_filter_block1d8_v8_sse2;
20filter8_1dfunction aom_filter_block1d8_h8_sse2;
21filter8_1dfunction aom_filter_block1d4_v8_sse2;
22filter8_1dfunction aom_filter_block1d4_h8_sse2;
23filter8_1dfunction aom_filter_block1d16_v8_avg_sse2;
24filter8_1dfunction aom_filter_block1d16_h8_avg_sse2;
25filter8_1dfunction aom_filter_block1d8_v8_avg_sse2;
26filter8_1dfunction aom_filter_block1d8_h8_avg_sse2;
27filter8_1dfunction aom_filter_block1d4_v8_avg_sse2;
28filter8_1dfunction aom_filter_block1d4_h8_avg_sse2;
Yaowu Xuc27fc142016-08-22 16:08:15 -070029
Yaowu Xuf883b422016-08-30 14:01:10 -070030filter8_1dfunction aom_filter_block1d16_v2_sse2;
31filter8_1dfunction aom_filter_block1d16_h2_sse2;
32filter8_1dfunction aom_filter_block1d8_v2_sse2;
33filter8_1dfunction aom_filter_block1d8_h2_sse2;
34filter8_1dfunction aom_filter_block1d4_v2_sse2;
35filter8_1dfunction aom_filter_block1d4_h2_sse2;
36filter8_1dfunction aom_filter_block1d16_v2_avg_sse2;
37filter8_1dfunction aom_filter_block1d16_h2_avg_sse2;
38filter8_1dfunction aom_filter_block1d8_v2_avg_sse2;
39filter8_1dfunction aom_filter_block1d8_h2_avg_sse2;
40filter8_1dfunction aom_filter_block1d4_v2_avg_sse2;
41filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
Yaowu Xuc27fc142016-08-22 16:08:15 -070042
Yaowu Xuf883b422016-08-30 14:01:10 -070043// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -070044// uint8_t *dst, ptrdiff_t dst_stride,
45// const int16_t *filter_x, int x_step_q4,
46// const int16_t *filter_y, int y_step_q4,
47// int w, int h);
Yaowu Xuf883b422016-08-30 14:01:10 -070048// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -070049// uint8_t *dst, ptrdiff_t dst_stride,
50// const int16_t *filter_x, int x_step_q4,
51// const int16_t *filter_y, int y_step_q4,
52// int w, int h);
Yaowu Xuf883b422016-08-30 14:01:10 -070053// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -070054// uint8_t *dst, ptrdiff_t dst_stride,
55// const int16_t *filter_x, int x_step_q4,
56// const int16_t *filter_y, int y_step_q4,
57// int w, int h);
Yaowu Xuf883b422016-08-30 14:01:10 -070058// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -070059// uint8_t *dst, ptrdiff_t dst_stride,
60// const int16_t *filter_x, int x_step_q4,
61// const int16_t *filter_y, int y_step_q4,
62// int w, int h);
63FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
64FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
65FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
66FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
67
Yaowu Xuf883b422016-08-30 14:01:10 -070068// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -070069// uint8_t *dst, ptrdiff_t dst_stride,
70// const int16_t *filter_x, int x_step_q4,
71// const int16_t *filter_y, int y_step_q4,
72// int w, int h);
Yaowu Xuf883b422016-08-30 14:01:10 -070073// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -070074// uint8_t *dst, ptrdiff_t dst_stride,
75// const int16_t *filter_x, int x_step_q4,
76// const int16_t *filter_y, int y_step_q4,
77// int w, int h);
78FUN_CONV_2D(, sse2);
79FUN_CONV_2D(avg_, sse2);
80
Yaowu Xud3e7c682017-12-21 14:08:25 -080081#if ARCH_X86_64
Yaowu Xuf883b422016-08-30 14:01:10 -070082highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
83highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
84highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
85highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
86highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
87highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
88highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2;
89highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2;
90highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2;
91highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2;
92highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2;
93highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2;
Yaowu Xuc27fc142016-08-22 16:08:15 -070094
Yaowu Xuf883b422016-08-30 14:01:10 -070095highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
96highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
97highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
98highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
99highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
100highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
101highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2;
102highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2;
103highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2;
104highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2;
105highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2;
106highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
Yaowu Xuc27fc142016-08-22 16:08:15 -0700107
Yaowu Xuf883b422016-08-30 14:01:10 -0700108// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700109// ptrdiff_t src_stride,
110// uint8_t *dst,
111// ptrdiff_t dst_stride,
112// const int16_t *filter_x,
113// int x_step_q4,
114// const int16_t *filter_y,
115// int y_step_q4,
116// int w, int h, int bd);
Yaowu Xuf883b422016-08-30 14:01:10 -0700117// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700118// ptrdiff_t src_stride,
119// uint8_t *dst,
120// ptrdiff_t dst_stride,
121// const int16_t *filter_x,
122// int x_step_q4,
123// const int16_t *filter_y,
124// int y_step_q4,
125// int w, int h, int bd);
Yaowu Xuf883b422016-08-30 14:01:10 -0700126// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700127// ptrdiff_t src_stride,
128// uint8_t *dst,
129// ptrdiff_t dst_stride,
130// const int16_t *filter_x,
131// int x_step_q4,
132// const int16_t *filter_y,
133// int y_step_q4,
134// int w, int h, int bd);
Yaowu Xuf883b422016-08-30 14:01:10 -0700135// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700136// ptrdiff_t src_stride,
137// uint8_t *dst,
138// ptrdiff_t dst_stride,
139// const int16_t *filter_x,
140// int x_step_q4,
141// const int16_t *filter_y,
142// int y_step_q4,
143// int w, int h, int bd);
144HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
145HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
146HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
147HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
148 sse2);
149
Yaowu Xuf883b422016-08-30 14:01:10 -0700150// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700151// uint8_t *dst, ptrdiff_t dst_stride,
152// const int16_t *filter_x, int x_step_q4,
153// const int16_t *filter_y, int y_step_q4,
154// int w, int h, int bd);
Yaowu Xuf883b422016-08-30 14:01:10 -0700155// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700156// uint8_t *dst, ptrdiff_t dst_stride,
157// const int16_t *filter_x, int x_step_q4,
158// const int16_t *filter_y, int y_step_q4,
159// int w, int h, int bd);
160HIGH_FUN_CONV_2D(, sse2);
161HIGH_FUN_CONV_2D(avg_, sse2);
David Barkerbe6cc072016-12-15 15:39:10 +0000162
163#if CONFIG_LOOP_RESTORATION
// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
//
// Implementation: tap 3 of each kernel is raised by 128 for the duration of
// the aom_highbd_convolve8_sse2() call and restored afterwards. (Presumably
// 128 is unity in the kernels' fixed-point format, so this folds the
// "add source sample" term into the kernel -- confirm against the C
// reference implementation.)
//
// Parameters are forwarded unchanged to aom_highbd_convolve8_sse2().
// x_step_q4 and y_step_q4 must both be 16 (asserted).
//
// Caveats:
//  - The kernels are modified in place through a cast that drops `const`, so
//    filter_x / filter_y must point to writable storage, and no other thread
//    may read them while this call is in progress.
//  - NOTE(review): working on local copies would avoid the cast, but the
//    underlying kernels may expect aligned filter storage -- verify before
//    changing the approach.
void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h, int bd) {
  int16_t *const kernel_x = (int16_t *)filter_x;
  int16_t *const kernel_y = (int16_t *)filter_y;
  // If both arguments name the same kernel it must be adjusted only once;
  // bumping it twice would hand a centre tap of +256 to both filter passes.
  const int shared_kernel = (kernel_x == kernel_y);

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  kernel_x[3] += 128;
  if (!shared_kernel) {
    kernel_y[3] += 128;
  }

  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h, bd);

  // Undo the adjustment so the caller sees its kernels unchanged.
  kernel_x[3] -= 128;
  if (!shared_kernel) {
    kernel_y[3] -= 128;
  }
}
180#endif // CONFIG_LOOP_RESTORATION
Yaowu Xud3e7c682017-12-21 14:08:25 -0800181#endif // ARCH_X86_64
Yaowu Xuc27fc142016-08-22 16:08:15 -0700182#endif // HAVE_SSE2