/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <string.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"

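// Scalar dot-product helpers: each computes the SUBPEL_TAPS-tap (8-tap) sum
// of source samples against one filter kernel, reading consecutive pixels in
// the horizontal case and stride-separated pixels in the vertical case.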
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
  int sum = 0;
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
  return sum;
}

static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
                                      const int16_t *b) {
  int sum = 0;
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
  return sum;
}

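// The *_q4 convolutions track the sub-pixel source position in a fixed-point
// counter: the bits above SUBPEL_BITS select the integer source pixel and the
// low bits (masked with SUBPEL_MASK) select one of the SUBPEL_SHIFTS filter
// phases. Stepping the counter by x_step_q4 / y_step_q4 per output pixel
// handles interpolation and scaling with the same loop.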
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int sum = horz_scalar_product(src_x, x_filter);
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

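// The *_scale_c variants follow the same structure but keep the position
// counter in the higher-precision "qn" format: SCALE_SUBPEL_BITS fractional
// bits per pixel, of which only the top bits (after dropping SCALE_EXTRA_BITS)
// select the interpolation filter phase.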
static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_qn,
                                   int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_qn = x0_qn;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
      const int sum = horz_scalar_product(src_x, x_filter);
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int sum = horz_scalar_product(src_x, x_filter);
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_qn,
                                       int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_qn = x0_qn;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
      const int sum = horz_scalar_product(src_x, x_filter);
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_qn,
                                  int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (int x = 0; x < w; ++x) {
    int y_qn = y0_qn;
    for (int y = 0; y < h; ++y) {
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_qn,
                                      int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (int x = 0; x < w; ++x) {
    int y_qn = y0_qn;
    for (int y = 0; y < h; ++y) {
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
                     int x0_q4, int x_step_q4,
                     const InterpKernel *const y_filters, int y0_q4,
                     int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                 intermediate_height);
  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}

static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_qn,
                             int x_step_qn, const InterpKernel *const y_filters,
                             int y0_qn, int y_step_qn, int w, int h) {
  // TODO(afergs): Update comment here
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
  assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);

  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
                         intermediate_height);
  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
}

static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}

static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

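// Public aom_convolve8_* entry points. The filter pointer passed in is an
// entry of an aligned filter bank; get_filter_base()/get_filter_offset()
// recover the bank and the starting phase from that pointer.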
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                 w, h);
}

void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int subpel_x,
                                 int x_step_qn, const int16_t *filter_y,
                                 int subpel_y, int y_step_qn, int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);

  (void)subpel_y;
  (void)filter_y;
  (void)y_step_qn;

  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
                         x_step_qn, w, h);
}

void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                     x_step_q4, w, h);
}

void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int subpel_x,
                                     int x_step_qn, const int16_t *filter_y,
                                     int subpel_y, int y_step_qn, int w,
                                     int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);

  (void)subpel_y;
  (void)filter_y;
  (void)y_step_qn;

  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
                             subpel_x, x_step_qn, w, h);
}

void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4, int w,
                          int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
                w, h);
}

void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int subpel_x,
                                int x_step_qn, const int16_t *filter_y,
                                int subpel_y, int y_step_qn, int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);

  (void)subpel_x;
  (void)filter_x;
  (void)x_step_qn;

  convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
                        y_step_qn, w, h);
}

void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                    y_step_q4, w, h);
}

void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int subpel_x,
                                    int x_step_qn, const int16_t *filter_y,
                                    int subpel_y, int y_step_qn, int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);

  (void)subpel_x;
  (void)filter_x;
  (void)x_step_qn;

  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
                            subpel_y, y_step_qn, w, h);
}

void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
           filters_y, y0_q4, y_step_q4, w, h);
}

void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int subpel_x, int x_step_qn,
                           const int16_t *filter_y, int subpel_y, int y_step_qn,
                           int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);

  convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
                   x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
}

void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
                     h);
}

void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int subpel_x,
                               int x_step_qn, const int16_t *filter_y,
                               int subpel_y, int y_step_qn, int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
                     h);
}

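// No-filter fallbacks: a plain row copy and a (dst + src + 1) >> 1 average.
// The filter arguments exist only to match the common convolve signature.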
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  int r;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}

void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);

    src += src_stride;
    dst += dst_stride;
  }
}

void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
}

void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                       filter_y, y_step_q4, w, h);
}

void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
}

void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
}

void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                           x_step_q4, filter_y, y_step_q4, w, h);
}

void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                      filter_y, y_step_q4, w, h);
}

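// 16-bit sample dot product, shared by the high-bitdepth functions below and
// by the loop-restoration path that keeps a 16-bit intermediate buffer.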
#if CONFIG_HIGHBITDEPTH || CONFIG_LOOP_RESTORATION
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                             ptrdiff_t a_stride,
                                             const int16_t *b) {
  int sum = 0;
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
  return sum;
}
#endif

// TODO(afergs): Make sure this works too
#if CONFIG_LOOP_RESTORATION
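// The add_src variants add the unfiltered source pixel at the filter centre
// back onto the filtered output before clamping. They are only built when
// loop restoration is enabled.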
563static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
564 uint8_t *dst, ptrdiff_t dst_stride,
565 const InterpKernel *x_filters, int x0_q4,
566 int x_step_q4, int w, int h) {
David Barkerbe6cc072016-12-15 15:39:10 +0000567 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100568 for (int y = 0; y < h; ++y) {
David Barkerbe6cc072016-12-15 15:39:10 +0000569 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100570 for (int x = 0; x < w; ++x) {
David Barkerbe6cc072016-12-15 15:39:10 +0000571 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
572 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100573
574 const int sum = horz_scalar_product(src_x, x_filter);
David Barkerbe6cc072016-12-15 15:39:10 +0000575 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
576 src_x[SUBPEL_TAPS / 2 - 1]);
577 x_q4 += x_step_q4;
578 }
579 src += src_stride;
580 dst += dst_stride;
581 }
582}
583
584static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
585 uint8_t *dst, ptrdiff_t dst_stride,
586 const InterpKernel *y_filters, int y0_q4,
587 int y_step_q4, int w, int h) {
David Barkerbe6cc072016-12-15 15:39:10 +0000588 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
589
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100590 for (int x = 0; x < w; ++x) {
David Barkerbe6cc072016-12-15 15:39:10 +0000591 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100592 for (int y = 0; y < h; ++y) {
David Barkerbe6cc072016-12-15 15:39:10 +0000593 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
594 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100595 const int sum = vert_scalar_product(src_y, src_stride, y_filter);
David Barkerbe6cc072016-12-15 15:39:10 +0000596 dst[y * dst_stride] =
597 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
598 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
599 y_q4 += y_step_q4;
600 }
601 ++src;
602 ++dst;
603 }
604}
605
606static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
607 uint8_t *dst, ptrdiff_t dst_stride,
608 const InterpKernel *const x_filters, int x0_q4,
609 int x_step_q4, const InterpKernel *const y_filters,
610 int y0_q4, int y_step_q4, int w, int h) {
611 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +0100612 const int intermediate_height =
David Barkerbe6cc072016-12-15 15:39:10 +0000613 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
614
615 assert(w <= MAX_SB_SIZE);
616 assert(h <= MAX_SB_SIZE);
617
618 assert(y_step_q4 <= 32);
619 assert(x_step_q4 <= 32);
620
621 convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
622 temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
623 intermediate_height);
624 convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
625 dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
626}
627
void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                         x_step_q4, w, h);
}

void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                        y_step_q4, w, h);
}

void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
}

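// The _hip variants keep EXTRAPREC_BITS extra bits of precision in the 16-bit
// intermediate buffer between the horizontal and vertical passes, clamping to
// EXTRAPREC_CLAMP_LIMIT(bd) in between; the final vertical pass removes the
// extra precision before writing 8-bit output.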
static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h) {
  const int bd = 8;
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                           (1 << (bd + FILTER_BITS - 1));
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
  const int bd = 8;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int rounding =
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
      const int sum =
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *const x_filters, int x0_q4,
                                 int x_step_q4,
                                 const InterpKernel *const y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
                             x_step_q4, w, intermediate_height);
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
                            y_step_q4, w, h);
}

void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                             x_step_q4, w, h);
}

void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                            y_step_q4, w, h);
}

void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                       x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
}
#endif  // CONFIG_LOOP_RESTORATION

// TODO(afergs): Make sure this works too
#if CONFIG_HIGHBITDEPTH

static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                             const int16_t *b) {
  int sum = 0;
  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
  return sum;
}

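// High-bitdepth counterparts of the functions above. Buffers are passed as
// uint8_t pointers for interface compatibility and converted back to 16-bit
// sample pointers with CONVERT_TO_SHORTPTR(); clip_pixel_highbd() clamps to
// the range implied by bd.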
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h, int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
                        x_step_q4, w, intermediate_height, bd);
  highbd_convolve_vert(
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}

void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                        x_step_q4, w, h, bd);
}

void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                            x_step_q4, w, h, bd);
}

void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h, int bd) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                       y_step_q4, w, h, bd);
}

void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                           y_step_q4, w, h, bd);
}

void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                  filters_y, y0_q4, y_step_q4, w, h, bd);
}

void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
}

void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  int r;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w * sizeof(uint16_t));
    src += src_stride;
    dst += dst_stride;
  }
}

void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}

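// High-bitdepth versions of the loop-restoration (add_src and _hip) convolves
// defined earlier for 8-bit input.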
#if CONFIG_LOOP_RESTORATION
static void highbd_convolve_add_src_horiz(const uint8_t *src8,
                                           ptrdiff_t src_stride, uint8_t *dst8,
                                           ptrdiff_t dst_stride,
                                           const InterpKernel *x_filters,
                                           int x0_q4, int x_step_q4, int w,
                                           int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
      dst[x] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
          bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_add_src_vert(const uint8_t *src8,
                                          ptrdiff_t src_stride, uint8_t *dst8,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *y_filters,
                                          int y0_q4, int y_step_q4, int w, int h,
                                          int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
                            bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    int x0_q4, int x_step_q4,
                                    const InterpKernel *const y_filters,
                                    int y0_q4, int y_step_q4, int w, int h,
                                    int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                                src_stride, CONVERT_TO_BYTEPTR(temp),
                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                                intermediate_height, bd);
  highbd_convolve_add_src_vert(
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}

void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                          x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
}

static void highbd_convolve_add_src_horiz_hip(
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h, int bd) {
  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                           (1 << (bd + FILTER_BITS - 1));
      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
                          0, extraprec_clamp_limit - 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_add_src_vert_hip(
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
    int y_step_q4, int w, int h, int bd) {
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (int x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (int y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      const int rounding =
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
      const int sum =
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
      dst[y * dst_stride] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve_add_src_hip(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
    int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
    int y_step_q4, int w, int h, int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  highbd_convolve_add_src_horiz_hip(
      src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
      x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
  highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                                   MAX_SB_SIZE, dst, dst_stride, y_filters,
                                   y0_q4, y_step_q4, w, h, bd);
}

void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
                                        ptrdiff_t src_stride, uint8_t *dst,
                                        ptrdiff_t dst_stride,
                                        const int16_t *filter_x, int x_step_q4,
                                        const int16_t *filter_y, int y_step_q4,
                                        int w, int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
                              x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
                              h, bd);
}

#endif  // CONFIG_LOOP_RESTORATION
#endif  // CONFIG_HIGHBITDEPTH