blob: c99b191a10a333634573dff30ec97f1309077269 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include <assert.h>
13#include <string.h>
14
Yaowu Xuf883b422016-08-30 14:01:10 -070015#include "./aom_config.h"
16#include "./aom_dsp_rtcd.h"
17#include "aom/aom_integer.h"
18#include "aom_dsp/aom_convolve.h"
19#include "aom_dsp/aom_dsp_common.h"
20#include "aom_dsp/aom_filter.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070021#include "aom_ports/mem.h"
22
Sebastien Alaiwanb093b142017-11-09 17:23:58 +010023static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
24 int sum = 0;
25 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
26 return sum;
27}
28
29static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
30 const int16_t *b) {
31 int sum = 0;
32 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
33 return sum;
34}
35
Yaowu Xuc27fc142016-08-22 16:08:15 -070036static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
37 uint8_t *dst, ptrdiff_t dst_stride,
38 const InterpKernel *x_filters, int x0_q4,
39 int x_step_q4, int w, int h) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070040 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +010041 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070042 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +010043 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070044 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
45 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +010046 const int sum = horz_scalar_product(src_x, x_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -070047 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
48 x_q4 += x_step_q4;
49 }
50 src += src_stride;
51 dst += dst_stride;
52 }
53}
54
Fergus Simpson505f0062017-06-27 11:23:34 -070055static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
56 uint8_t *dst, ptrdiff_t dst_stride,
57 const InterpKernel *x_filters, int x0_qn,
58 int x_step_qn, int w, int h) {
Fergus Simpson505f0062017-06-27 11:23:34 -070059 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +010060 for (int y = 0; y < h; ++y) {
Fergus Simpson505f0062017-06-27 11:23:34 -070061 int x_qn = x0_qn;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +010062 for (int x = 0; x < w; ++x) {
Fergus Simpson505f0062017-06-27 11:23:34 -070063 const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; // q8
64 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
65 assert(x_filter_idx < SUBPEL_SHIFTS);
66 const int16_t *const x_filter = x_filters[x_filter_idx];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +010067 const int sum = horz_scalar_product(src_x, x_filter);
Fergus Simpson505f0062017-06-27 11:23:34 -070068 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
69 x_qn += x_step_qn;
70 }
71 src += src_stride;
72 dst += dst_stride;
73 }
74}
75
Yaowu Xuc27fc142016-08-22 16:08:15 -070076static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
77 uint8_t *dst, ptrdiff_t dst_stride,
78 const InterpKernel *x_filters, int x0_q4,
79 int x_step_q4, int w, int h) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070080 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +010081 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070082 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +010083 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -070084 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
85 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +010086 const int sum = horz_scalar_product(src_x, x_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -070087 dst[x] = ROUND_POWER_OF_TWO(
88 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
89 x_q4 += x_step_q4;
90 }
91 src += src_stride;
92 dst += dst_stride;
93 }
94}
95
Fergus Simpson505f0062017-06-27 11:23:34 -070096static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
97 uint8_t *dst, ptrdiff_t dst_stride,
98 const InterpKernel *x_filters, int x0_qn,
99 int x_step_qn, int w, int h) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700100 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100101 for (int y = 0; y < h; ++y) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700102 int x_qn = x0_qn;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100103 for (int x = 0; x < w; ++x) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700104 const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
105 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
106 assert(x_filter_idx < SUBPEL_SHIFTS);
107 const int16_t *const x_filter = x_filters[x_filter_idx];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100108 const int sum = horz_scalar_product(src_x, x_filter);
Fergus Simpson505f0062017-06-27 11:23:34 -0700109 dst[x] = ROUND_POWER_OF_TWO(
110 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
111 x_qn += x_step_qn;
112 }
113 src += src_stride;
114 dst += dst_stride;
115 }
116}
117
Yaowu Xuc27fc142016-08-22 16:08:15 -0700118static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
119 uint8_t *dst, ptrdiff_t dst_stride,
120 const InterpKernel *y_filters, int y0_q4,
121 int y_step_q4, int w, int h) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700122 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
123
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100124 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700125 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100126 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700127 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
128 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100129 const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700130 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
131 y_q4 += y_step_q4;
132 }
133 ++src;
134 ++dst;
135 }
136}
137
Fergus Simpson505f0062017-06-27 11:23:34 -0700138static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
139 uint8_t *dst, ptrdiff_t dst_stride,
140 const InterpKernel *y_filters, int y0_qn,
141 int y_step_qn, int w, int h) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700142 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
143
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100144 for (int x = 0; x < w; ++x) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700145 int y_qn = y0_qn;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100146 for (int y = 0; y < h; ++y) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700147 const unsigned char *src_y =
148 &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
149 const int16_t *const y_filter =
150 y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100151 const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Fergus Simpson505f0062017-06-27 11:23:34 -0700152 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
153 y_qn += y_step_qn;
154 }
155 ++src;
156 ++dst;
157 }
158}
159
Yaowu Xuc27fc142016-08-22 16:08:15 -0700160static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
161 uint8_t *dst, ptrdiff_t dst_stride,
162 const InterpKernel *y_filters, int y0_q4,
163 int y_step_q4, int w, int h) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700164 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
165
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100166 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700167 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100168 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700169 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
170 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100171 const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700172 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
173 dst[y * dst_stride] +
174 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
175 1);
176 y_q4 += y_step_q4;
177 }
178 ++src;
179 ++dst;
180 }
181}
182
Fergus Simpson505f0062017-06-27 11:23:34 -0700183static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
184 uint8_t *dst, ptrdiff_t dst_stride,
185 const InterpKernel *y_filters, int y0_qn,
186 int y_step_qn, int w, int h) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700187 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
188
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100189 for (int x = 0; x < w; ++x) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700190 int y_qn = y0_qn;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100191 for (int y = 0; y < h; ++y) {
Fergus Simpson505f0062017-06-27 11:23:34 -0700192 const unsigned char *src_y =
193 &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
194 const int16_t *const y_filter =
195 y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100196 const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Fergus Simpson505f0062017-06-27 11:23:34 -0700197 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
198 dst[y * dst_stride] +
199 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
200 1);
201 y_qn += y_step_qn;
202 }
203 ++src;
204 ++dst;
205 }
206}
207
Yaowu Xuc27fc142016-08-22 16:08:15 -0700208static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
209 ptrdiff_t dst_stride, const InterpKernel *const x_filters,
210 int x0_q4, int x_step_q4,
211 const InterpKernel *const y_filters, int y0_q4,
212 int y_step_q4, int w, int h) {
213 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
214 // 2d filtering proceeds in 2 steps:
215 // (1) Interpolate horizontally into an intermediate buffer, temp.
216 // (2) Interpolate temp vertically to derive the sub-pixel result.
217 // Deriving the maximum number of rows in the temp buffer (135):
218 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
219 // --Largest block size is 64x64 pixels.
220 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
221 // original frame (in 1/16th pixel units).
222 // --Must round-up because block may be located at sub-pixel position.
223 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
224 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
225 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +0100226 const int intermediate_height =
Yaowu Xuc27fc142016-08-22 16:08:15 -0700227 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
228
229 assert(w <= MAX_SB_SIZE);
230 assert(h <= MAX_SB_SIZE);
231
232 assert(y_step_q4 <= 32);
233 assert(x_step_q4 <= 32);
234
235 convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
236 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
237 intermediate_height);
238 convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
239 dst_stride, y_filters, y0_q4, y_step_q4, w, h);
240}
241
Fergus Simpson505f0062017-06-27 11:23:34 -0700242static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
243 uint8_t *dst, ptrdiff_t dst_stride,
244 const InterpKernel *const x_filters, int x0_qn,
245 int x_step_qn, const InterpKernel *const y_filters,
246 int y0_qn, int y_step_qn, int w, int h) {
247 // TODO(afergs): Update comment here
248 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
249 // 2d filtering proceeds in 2 steps:
250 // (1) Interpolate horizontally into an intermediate buffer, temp.
251 // (2) Interpolate temp vertically to derive the sub-pixel result.
252 // Deriving the maximum number of rows in the temp buffer (135):
253 // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
254 // --Largest block size is 64x64 pixels.
255 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
256 // original frame (in 1/16th pixel units).
257 // --Must round-up because block may be located at sub-pixel position.
258 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
259 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
260 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +0100261 const int intermediate_height =
Fergus Simpson505f0062017-06-27 11:23:34 -0700262 (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
263
264 assert(w <= MAX_SB_SIZE);
265 assert(h <= MAX_SB_SIZE);
266
267 assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
268 assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
269
270 convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
271 temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
272 intermediate_height);
273 convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
274 dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
275}
276
Yaowu Xuc27fc142016-08-22 16:08:15 -0700277static const InterpKernel *get_filter_base(const int16_t *filter) {
278 // NOTE: This assumes that the filter table is 256-byte aligned.
279 // TODO(agrange) Modify to make independent of table alignment.
280 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
281}
282
283static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
284 return (int)((const InterpKernel *)(intptr_t)f - base);
285}
286
Yaowu Xuf883b422016-08-30 14:01:10 -0700287void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700288 uint8_t *dst, ptrdiff_t dst_stride,
289 const int16_t *filter_x, int x_step_q4,
290 const int16_t *filter_y, int y_step_q4, int w,
291 int h) {
292 const InterpKernel *const filters_x = get_filter_base(filter_x);
293 const int x0_q4 = get_filter_offset(filter_x, filters_x);
294
295 (void)filter_y;
296 (void)y_step_q4;
297
298 convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
299 w, h);
300}
301
Fergus Simpson505f0062017-06-27 11:23:34 -0700302void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
303 uint8_t *dst, ptrdiff_t dst_stride,
304 const int16_t *filter_x, int subpel_x,
305 int x_step_qn, const int16_t *filter_y,
306 int subpel_y, int y_step_qn, int w, int h) {
307 const InterpKernel *const filters_x = get_filter_base(filter_x);
308
309 (void)subpel_y;
310 (void)filter_y;
311 (void)y_step_qn;
312
313 convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
314 x_step_qn, w, h);
315}
316
Yaowu Xuf883b422016-08-30 14:01:10 -0700317void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700318 uint8_t *dst, ptrdiff_t dst_stride,
319 const int16_t *filter_x, int x_step_q4,
320 const int16_t *filter_y, int y_step_q4, int w,
321 int h) {
322 const InterpKernel *const filters_x = get_filter_base(filter_x);
323 const int x0_q4 = get_filter_offset(filter_x, filters_x);
324
325 (void)filter_y;
326 (void)y_step_q4;
327
328 convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
329 x_step_q4, w, h);
330}
331
Fergus Simpson505f0062017-06-27 11:23:34 -0700332void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
333 uint8_t *dst, ptrdiff_t dst_stride,
334 const int16_t *filter_x, int subpel_x,
335 int x_step_qn, const int16_t *filter_y,
336 int subpel_y, int y_step_qn, int w,
337 int h) {
338 const InterpKernel *const filters_x = get_filter_base(filter_x);
339
340 (void)subpel_y;
341 (void)filter_y;
342 (void)y_step_qn;
343
344 convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
345 subpel_x, x_step_qn, w, h);
346}
347
Yaowu Xuf883b422016-08-30 14:01:10 -0700348void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700349 uint8_t *dst, ptrdiff_t dst_stride,
350 const int16_t *filter_x, int x_step_q4,
351 const int16_t *filter_y, int y_step_q4, int w,
352 int h) {
353 const InterpKernel *const filters_y = get_filter_base(filter_y);
354 const int y0_q4 = get_filter_offset(filter_y, filters_y);
355
356 (void)filter_x;
357 (void)x_step_q4;
358
359 convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
360 w, h);
361}
362
Fergus Simpson505f0062017-06-27 11:23:34 -0700363void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
364 uint8_t *dst, ptrdiff_t dst_stride,
365 const int16_t *filter_x, int subpel_x,
366 int x_step_qn, const int16_t *filter_y,
367 int subpel_y, int y_step_qn, int w, int h) {
368 const InterpKernel *const filters_y = get_filter_base(filter_y);
369
370 (void)subpel_x;
371 (void)filter_x;
372 (void)x_step_qn;
373
374 convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
375 y_step_qn, w, h);
376}
377
Yaowu Xuf883b422016-08-30 14:01:10 -0700378void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700379 uint8_t *dst, ptrdiff_t dst_stride,
380 const int16_t *filter_x, int x_step_q4,
381 const int16_t *filter_y, int y_step_q4, int w,
382 int h) {
383 const InterpKernel *const filters_y = get_filter_base(filter_y);
384 const int y0_q4 = get_filter_offset(filter_y, filters_y);
385
386 (void)filter_x;
387 (void)x_step_q4;
388
389 convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
390 y_step_q4, w, h);
391}
392
Fergus Simpson505f0062017-06-27 11:23:34 -0700393void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
394 uint8_t *dst, ptrdiff_t dst_stride,
395 const int16_t *filter_x, int subpel_x,
396 int x_step_qn, const int16_t *filter_y,
397 int subpel_y, int y_step_qn, int w, int h) {
398 const InterpKernel *const filters_y = get_filter_base(filter_y);
399
400 (void)subpel_x;
401 (void)filter_x;
402 (void)x_step_qn;
403
404 convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
405 subpel_y, y_step_qn, w, h);
406}
407
Yaowu Xuf883b422016-08-30 14:01:10 -0700408void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700409 ptrdiff_t dst_stride, const int16_t *filter_x,
410 int x_step_q4, const int16_t *filter_y, int y_step_q4,
411 int w, int h) {
412 const InterpKernel *const filters_x = get_filter_base(filter_x);
413 const int x0_q4 = get_filter_offset(filter_x, filters_x);
414
415 const InterpKernel *const filters_y = get_filter_base(filter_y);
416 const int y0_q4 = get_filter_offset(filter_y, filters_y);
417
418 convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
419 filters_y, y0_q4, y_step_q4, w, h);
420}
421
Fergus Simpson505f0062017-06-27 11:23:34 -0700422void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
423 uint8_t *dst, ptrdiff_t dst_stride,
424 const int16_t *filter_x, int subpel_x, int x_step_qn,
425 const int16_t *filter_y, int subpel_y, int y_step_qn,
426 int w, int h) {
427 const InterpKernel *const filters_x = get_filter_base(filter_x);
428
429 const InterpKernel *const filters_y = get_filter_base(filter_y);
430
431 convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
432 x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
433}
434
Yaowu Xuf883b422016-08-30 14:01:10 -0700435void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700436 ptrdiff_t dst_stride, const int16_t *filter_x,
437 int x_step_q4, const int16_t *filter_y, int y_step_q4,
438 int w, int h) {
439 /* Fixed size intermediate buffer places limits on parameters. */
440 DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
441 assert(w <= MAX_SB_SIZE);
442 assert(h <= MAX_SB_SIZE);
443
Yaowu Xuf883b422016-08-30 14:01:10 -0700444 aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700445 filter_y, y_step_q4, w, h);
Yaowu Xuf883b422016-08-30 14:01:10 -0700446 aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700447 h);
448}
449
Fergus Simpson505f0062017-06-27 11:23:34 -0700450void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
451 uint8_t *dst, ptrdiff_t dst_stride,
452 const int16_t *filter_x, int subpel_x,
453 int x_step_qn, const int16_t *filter_y,
454 int subpel_y, int y_step_qn, int w, int h) {
455 /* Fixed size intermediate buffer places limits on parameters. */
456 DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
457 assert(w <= MAX_SB_SIZE);
458 assert(h <= MAX_SB_SIZE);
459
460 aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
461 x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
462 aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
463 h);
464}
465
// Copies a block of h rows, w bytes each, from src to dst, honouring the two
// row strides. The filter arguments are accepted only to match the common
// convolve signature and are ignored.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const size_t row_bytes = (size_t)w;
  for (int row = 0; row < h; ++row) {
    memcpy(dst + row * dst_stride, src + row * src_stride, row_bytes);
  }
}
483
// Blends a w x h block of src into dst: every dst sample becomes
// ROUND_POWER_OF_TWO(dst + src, 1). The filter arguments are accepted only to
// match the common convolve signature and are ignored.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
500
// Scaled horizontal prediction: forwards every argument unchanged to
// aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}
508
// Scaled vertical prediction: forwards every argument unchanged to
// aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}
516
// Scaled 2-D prediction: forwards every argument unchanged to
// aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}
524
// Scaled horizontal prediction blended into dst: forwards every argument
// unchanged to aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}
533
// Scaled vertical prediction blended into dst: forwards every argument
// unchanged to aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}
542
// Scaled 2-D prediction blended into dst: forwards every argument unchanged
// to aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}
550
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100551static INLINE int highbd_vert_scalar_product(const uint16_t *a,
552 ptrdiff_t a_stride,
553 const int16_t *b) {
554 int sum = 0;
555 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
556 return sum;
557}
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100558
Fergus Simpson505f0062017-06-27 11:23:34 -0700559// TODO(afergs): Make sure this works too
David Barkerbe6cc072016-12-15 15:39:10 +0000560#if CONFIG_LOOP_RESTORATION
561static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
562 uint8_t *dst, ptrdiff_t dst_stride,
563 const InterpKernel *x_filters, int x0_q4,
564 int x_step_q4, int w, int h) {
David Barkerbe6cc072016-12-15 15:39:10 +0000565 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100566 for (int y = 0; y < h; ++y) {
David Barkerbe6cc072016-12-15 15:39:10 +0000567 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100568 for (int x = 0; x < w; ++x) {
David Barkerbe6cc072016-12-15 15:39:10 +0000569 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
570 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100571
572 const int sum = horz_scalar_product(src_x, x_filter);
David Barkerbe6cc072016-12-15 15:39:10 +0000573 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
574 src_x[SUBPEL_TAPS / 2 - 1]);
575 x_q4 += x_step_q4;
576 }
577 src += src_stride;
578 dst += dst_stride;
579 }
580}
581
582static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
583 uint8_t *dst, ptrdiff_t dst_stride,
584 const InterpKernel *y_filters, int y0_q4,
585 int y_step_q4, int w, int h) {
David Barkerbe6cc072016-12-15 15:39:10 +0000586 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
587
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100588 for (int x = 0; x < w; ++x) {
David Barkerbe6cc072016-12-15 15:39:10 +0000589 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100590 for (int y = 0; y < h; ++y) {
David Barkerbe6cc072016-12-15 15:39:10 +0000591 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
592 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100593 const int sum = vert_scalar_product(src_y, src_stride, y_filter);
David Barkerbe6cc072016-12-15 15:39:10 +0000594 dst[y * dst_stride] =
595 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
596 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
597 y_q4 += y_step_q4;
598 }
599 ++src;
600 ++dst;
601 }
602}
603
604static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
605 uint8_t *dst, ptrdiff_t dst_stride,
606 const InterpKernel *const x_filters, int x0_q4,
607 int x_step_q4, const InterpKernel *const y_filters,
608 int y0_q4, int y_step_q4, int w, int h) {
609 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +0100610 const int intermediate_height =
David Barkerbe6cc072016-12-15 15:39:10 +0000611 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
612
613 assert(w <= MAX_SB_SIZE);
614 assert(h <= MAX_SB_SIZE);
615
616 assert(y_step_q4 <= 32);
617 assert(x_step_q4 <= 32);
618
619 convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
620 temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
621 intermediate_height);
622 convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
623 dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
624}
625
626void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
627 uint8_t *dst, ptrdiff_t dst_stride,
628 const int16_t *filter_x, int x_step_q4,
629 const int16_t *filter_y, int y_step_q4,
630 int w, int h) {
631 const InterpKernel *const filters_x = get_filter_base(filter_x);
632 const int x0_q4 = get_filter_offset(filter_x, filters_x);
633
634 (void)filter_y;
635 (void)y_step_q4;
636
637 convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
638 x_step_q4, w, h);
639}
640
641void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
642 uint8_t *dst, ptrdiff_t dst_stride,
643 const int16_t *filter_x, int x_step_q4,
644 const int16_t *filter_y, int y_step_q4, int w,
645 int h) {
646 const InterpKernel *const filters_y = get_filter_base(filter_y);
647 const int y0_q4 = get_filter_offset(filter_y, filters_y);
648
649 (void)filter_x;
650 (void)x_step_q4;
651
652 convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
653 y_step_q4, w, h);
654}
655
656void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
657 uint8_t *dst, ptrdiff_t dst_stride,
658 const int16_t *filter_x, int x_step_q4,
659 const int16_t *filter_y, int y_step_q4, int w,
660 int h) {
661 const InterpKernel *const filters_x = get_filter_base(filter_x);
662 const int x0_q4 = get_filter_offset(filter_x, filters_x);
663
664 const InterpKernel *const filters_y = get_filter_base(filter_y);
665 const int y0_q4 = get_filter_offset(filter_y, filters_y);
666
667 convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
668 x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
669}
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700670
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700671static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
672 uint16_t *dst, ptrdiff_t dst_stride,
673 const InterpKernel *x_filters, int x0_q4,
674 int x_step_q4, int w, int h) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700675 const int bd = 8;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700676 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100677 for (int y = 0; y < h; ++y) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700678 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100679 for (int x = 0; x < w; ++x) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700680 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
681 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100682 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
683 (1 << (bd + FILTER_BITS - 1));
684 const int sum = horz_scalar_product(src_x, x_filter) + rounding;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700685 dst[x] =
686 (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700687 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700688 x_q4 += x_step_q4;
689 }
690 src += src_stride;
691 dst += dst_stride;
692 }
693}
694
695static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
696 uint8_t *dst, ptrdiff_t dst_stride,
697 const InterpKernel *y_filters, int y0_q4,
698 int y_step_q4, int w, int h) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700699 const int bd = 8;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700700 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
701
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100702 for (int x = 0; x < w; ++x) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700703 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100704 for (int y = 0; y < h; ++y) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700705 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
706 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100707 const int rounding =
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700708 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
709 (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100710 const int sum =
711 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700712 dst[y * dst_stride] =
713 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
714 y_q4 += y_step_q4;
715 }
716 ++src;
717 ++dst;
718 }
719}
720
721static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
722 uint8_t *dst, ptrdiff_t dst_stride,
723 const InterpKernel *const x_filters, int x0_q4,
724 int x_step_q4,
725 const InterpKernel *const y_filters, int y0_q4,
726 int y_step_q4, int w, int h) {
727 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +0100728 const int intermediate_height =
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700729 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
730
731 assert(w <= MAX_SB_SIZE);
732 assert(h <= MAX_SB_SIZE);
733
734 assert(y_step_q4 <= 32);
735 assert(x_step_q4 <= 32);
736
737 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
738 src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
739 x_step_q4, w, intermediate_height);
740 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
741 MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
742 y_step_q4, w, h);
743}
744
745void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
746 uint16_t *dst, ptrdiff_t dst_stride,
747 const int16_t *filter_x, int x_step_q4,
748 const int16_t *filter_y, int y_step_q4,
749 int w, int h) {
750 const InterpKernel *const filters_x = get_filter_base(filter_x);
751 const int x0_q4 = get_filter_offset(filter_x, filters_x);
752
753 (void)filter_y;
754 (void)y_step_q4;
755
756 convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
757 x_step_q4, w, h);
758}
759
760void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
761 uint8_t *dst, ptrdiff_t dst_stride,
762 const int16_t *filter_x, int x_step_q4,
763 const int16_t *filter_y, int y_step_q4,
764 int w, int h) {
765 const InterpKernel *const filters_y = get_filter_base(filter_y);
766 const int y0_q4 = get_filter_offset(filter_y, filters_y);
767
768 (void)filter_x;
769 (void)x_step_q4;
770
771 convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
772 y_step_q4, w, h);
773}
774
775void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
776 uint8_t *dst, ptrdiff_t dst_stride,
777 const int16_t *filter_x, int x_step_q4,
778 const int16_t *filter_y, int y_step_q4, int w,
779 int h) {
780 const InterpKernel *const filters_x = get_filter_base(filter_x);
781 const int x0_q4 = get_filter_offset(filter_x, filters_x);
782
783 const InterpKernel *const filters_y = get_filter_base(filter_y);
784 const int y0_q4 = get_filter_offset(filter_y, filters_y);
785
786 convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
787 x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
788}
David Barkerbe6cc072016-12-15 15:39:10 +0000789#endif // CONFIG_LOOP_RESTORATION
790
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100791static INLINE int highbd_horz_scalar_product(const uint16_t *a,
792 const int16_t *b) {
793 int sum = 0;
794 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
795 return sum;
796}
797
Yaowu Xuc27fc142016-08-22 16:08:15 -0700798static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
799 uint8_t *dst8, ptrdiff_t dst_stride,
800 const InterpKernel *x_filters, int x0_q4,
801 int x_step_q4, int w, int h, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700802 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
803 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
804 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100805 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700806 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100807 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700808 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
809 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100810 const int sum = highbd_horz_scalar_product(src_x, x_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700811 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
812 x_q4 += x_step_q4;
813 }
814 src += src_stride;
815 dst += dst_stride;
816 }
817}
818
819static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
820 uint8_t *dst8, ptrdiff_t dst_stride,
821 const InterpKernel *x_filters, int x0_q4,
822 int x_step_q4, int w, int h, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700823 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
824 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
825 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100826 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700827 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100828 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700829 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
830 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100831 const int sum = highbd_horz_scalar_product(src_x, x_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700832 dst[x] = ROUND_POWER_OF_TWO(
833 dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
834 1);
835 x_q4 += x_step_q4;
836 }
837 src += src_stride;
838 dst += dst_stride;
839 }
840}
841
842static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
843 uint8_t *dst8, ptrdiff_t dst_stride,
844 const InterpKernel *y_filters, int y0_q4,
845 int y_step_q4, int w, int h, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700846 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
847 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
848 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100849 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700850 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100851 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700852 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
853 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100854 const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700855 dst[y * dst_stride] =
856 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
857 y_q4 += y_step_q4;
858 }
859 ++src;
860 ++dst;
861 }
862}
863
864static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
865 uint8_t *dst8, ptrdiff_t dst_stride,
866 const InterpKernel *y_filters, int y0_q4,
867 int y_step_q4, int w, int h, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700868 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
869 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
870 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100871 for (int x = 0; x < w; ++x) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700872 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +0100873 for (int y = 0; y < h; ++y) {
Yaowu Xuc27fc142016-08-22 16:08:15 -0700874 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
875 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +0100876 const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
Yaowu Xuc27fc142016-08-22 16:08:15 -0700877 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
878 dst[y * dst_stride] +
879 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
880 1);
881 y_q4 += y_step_q4;
882 }
883 ++src;
884 ++dst;
885 }
886}
887
888static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
889 uint8_t *dst, ptrdiff_t dst_stride,
890 const InterpKernel *const x_filters, int x0_q4,
891 int x_step_q4, const InterpKernel *const y_filters,
892 int y0_q4, int y_step_q4, int w, int h, int bd) {
893 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
894 // 2d filtering proceeds in 2 steps:
895 // (1) Interpolate horizontally into an intermediate buffer, temp.
896 // (2) Interpolate temp vertically to derive the sub-pixel result.
897 // Deriving the maximum number of rows in the temp buffer (135):
898 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
899 // --Largest block size is 64x64 pixels.
900 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
901 // original frame (in 1/16th pixel units).
902 // --Must round-up because block may be located at sub-pixel position.
903 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
904 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
905 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +0100906 const int intermediate_height =
Yaowu Xuc27fc142016-08-22 16:08:15 -0700907 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
908
909 assert(w <= MAX_SB_SIZE);
910 assert(h <= MAX_SB_SIZE);
911 assert(y_step_q4 <= 32);
912 assert(x_step_q4 <= 32);
913
914 highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
915 CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
916 x_step_q4, w, intermediate_height, bd);
917 highbd_convolve_vert(
918 CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
919 MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
920}
921
Yaowu Xuf883b422016-08-30 14:01:10 -0700922void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700923 uint8_t *dst, ptrdiff_t dst_stride,
924 const int16_t *filter_x, int x_step_q4,
925 const int16_t *filter_y, int y_step_q4, int w,
926 int h, int bd) {
927 const InterpKernel *const filters_x = get_filter_base(filter_x);
928 const int x0_q4 = get_filter_offset(filter_x, filters_x);
929 (void)filter_y;
930 (void)y_step_q4;
931
932 highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
933 x_step_q4, w, h, bd);
934}
935
Yaowu Xuf883b422016-08-30 14:01:10 -0700936void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700937 uint8_t *dst, ptrdiff_t dst_stride,
938 const int16_t *filter_x, int x_step_q4,
939 const int16_t *filter_y, int y_step_q4,
940 int w, int h, int bd) {
941 const InterpKernel *const filters_x = get_filter_base(filter_x);
942 const int x0_q4 = get_filter_offset(filter_x, filters_x);
943 (void)filter_y;
944 (void)y_step_q4;
945
946 highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
947 x_step_q4, w, h, bd);
948}
949
Yaowu Xuf883b422016-08-30 14:01:10 -0700950void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700951 uint8_t *dst, ptrdiff_t dst_stride,
952 const int16_t *filter_x, int x_step_q4,
953 const int16_t *filter_y, int y_step_q4, int w,
954 int h, int bd) {
955 const InterpKernel *const filters_y = get_filter_base(filter_y);
956 const int y0_q4 = get_filter_offset(filter_y, filters_y);
957 (void)filter_x;
958 (void)x_step_q4;
959
960 highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
961 y_step_q4, w, h, bd);
962}
963
Yaowu Xuf883b422016-08-30 14:01:10 -0700964void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700965 uint8_t *dst, ptrdiff_t dst_stride,
966 const int16_t *filter_x, int x_step_q4,
967 const int16_t *filter_y, int y_step_q4,
968 int w, int h, int bd) {
969 const InterpKernel *const filters_y = get_filter_base(filter_y);
970 const int y0_q4 = get_filter_offset(filter_y, filters_y);
971 (void)filter_x;
972 (void)x_step_q4;
973
974 highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
975 y_step_q4, w, h, bd);
976}
977
Yaowu Xuf883b422016-08-30 14:01:10 -0700978void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700979 uint8_t *dst, ptrdiff_t dst_stride,
980 const int16_t *filter_x, int x_step_q4,
981 const int16_t *filter_y, int y_step_q4, int w,
982 int h, int bd) {
983 const InterpKernel *const filters_x = get_filter_base(filter_x);
984 const int x0_q4 = get_filter_offset(filter_x, filters_x);
985
986 const InterpKernel *const filters_y = get_filter_base(filter_y);
987 const int y0_q4 = get_filter_offset(filter_y, filters_y);
988
989 highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
990 filters_y, y0_q4, y_step_q4, w, h, bd);
991}
992
Yaowu Xuf883b422016-08-30 14:01:10 -0700993void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700994 uint8_t *dst, ptrdiff_t dst_stride,
995 const int16_t *filter_x, int x_step_q4,
996 const int16_t *filter_y, int y_step_q4, int w,
997 int h, int bd) {
998 // Fixed size intermediate buffer places limits on parameters.
999 DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
1000 assert(w <= MAX_SB_SIZE);
1001 assert(h <= MAX_SB_SIZE);
1002
Yaowu Xuf883b422016-08-30 14:01:10 -07001003 aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001004 filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
Yaowu Xuf883b422016-08-30 14:01:10 -07001005 aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001006 dst_stride, NULL, 0, NULL, 0, w, h, bd);
1007}
1008
// Copies a w x h block of 16-bit samples from src8 to dst8 (C version).
// Both pointers are converted with CONVERT_TO_SHORTPTR; strides are in
// samples. The filter arguments and bd are unused.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *row_in = CONVERT_TO_SHORTPTR(src8);
  uint16_t *row_out = CONVERT_TO_SHORTPTR(dst8);
  const size_t row_bytes = w * sizeof(*row_out);

  (void)bd;
  (void)filter_y_stride;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_x;

  // One memcpy per row.
  for (int row = 0; row < h; ++row) {
    memcpy(row_out, row_in, row_bytes);
    row_in += src_stride;
    row_out += dst_stride;
  }
}
1029
// Averages a w x h block of 16-bit samples from src8 into dst8 (C version):
// each dst sample becomes ROUND_POWER_OF_TWO(dst + src, 1).
// Both pointers are converted with CONVERT_TO_SHORTPTR; strides are in
// samples. The filter arguments and bd are unused.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *row_in = CONVERT_TO_SHORTPTR(src8);
  uint16_t *row_out = CONVERT_TO_SHORTPTR(dst8);

  (void)bd;
  (void)filter_y_stride;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_x;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int total = row_out[col] + row_in[col];
      row_out[col] = ROUND_POWER_OF_TWO(total, 1);
    }
    row_in += src_stride;
    row_out += dst_stride;
  }
}
David Barkerbe6cc072016-12-15 15:39:10 +00001051
1052#if CONFIG_LOOP_RESTORATION
1053static void highbd_convolve_add_src_horiz(const uint8_t *src8,
1054 ptrdiff_t src_stride, uint8_t *dst8,
1055 ptrdiff_t dst_stride,
1056 const InterpKernel *x_filters,
1057 int x0_q4, int x_step_q4, int w,
1058 int h, int bd) {
David Barkerbe6cc072016-12-15 15:39:10 +00001059 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1060 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1061 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001062 for (int y = 0; y < h; ++y) {
David Barkerbe6cc072016-12-15 15:39:10 +00001063 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001064 for (int x = 0; x < w; ++x) {
David Barkerbe6cc072016-12-15 15:39:10 +00001065 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1066 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +01001067 const int sum = highbd_horz_scalar_product(src_x, x_filter);
David Barkerbe6cc072016-12-15 15:39:10 +00001068 dst[x] = clip_pixel_highbd(
1069 ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
1070 bd);
1071 x_q4 += x_step_q4;
1072 }
1073 src += src_stride;
1074 dst += dst_stride;
1075 }
1076}
1077
1078static void highbd_convolve_add_src_vert(const uint8_t *src8,
1079 ptrdiff_t src_stride, uint8_t *dst8,
1080 ptrdiff_t dst_stride,
1081 const InterpKernel *y_filters,
1082 int y0_q4, int y_step_q4, int w, int h,
1083 int bd) {
David Barkerbe6cc072016-12-15 15:39:10 +00001084 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1085 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1086 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001087 for (int x = 0; x < w; ++x) {
David Barkerbe6cc072016-12-15 15:39:10 +00001088 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001089 for (int y = 0; y < h; ++y) {
David Barkerbe6cc072016-12-15 15:39:10 +00001090 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1091 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +01001092 const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
David Barkerbe6cc072016-12-15 15:39:10 +00001093 dst[y * dst_stride] =
1094 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
1095 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
1096 bd);
1097 y_q4 += y_step_q4;
1098 }
1099 ++src;
1100 ++dst;
1101 }
1102}
1103
1104static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
1105 uint8_t *dst, ptrdiff_t dst_stride,
1106 const InterpKernel *const x_filters,
1107 int x0_q4, int x_step_q4,
1108 const InterpKernel *const y_filters,
1109 int y0_q4, int y_step_q4, int w, int h,
1110 int bd) {
1111 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1112 // 2d filtering proceeds in 2 steps:
1113 // (1) Interpolate horizontally into an intermediate buffer, temp.
1114 // (2) Interpolate temp vertically to derive the sub-pixel result.
1115 // Deriving the maximum number of rows in the temp buffer (135):
1116 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1117 // --Largest block size is 64x64 pixels.
1118 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1119 // original frame (in 1/16th pixel units).
1120 // --Must round-up because block may be located at sub-pixel position.
1121 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1122 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1123 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +01001124 const int intermediate_height =
David Barkerbe6cc072016-12-15 15:39:10 +00001125 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1126
1127 assert(w <= MAX_SB_SIZE);
1128 assert(h <= MAX_SB_SIZE);
1129 assert(y_step_q4 <= 32);
1130 assert(x_step_q4 <= 32);
1131
1132 highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1133 src_stride, CONVERT_TO_BYTEPTR(temp),
1134 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
1135 intermediate_height, bd);
1136 highbd_convolve_add_src_vert(
1137 CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1138 MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
1139}
1140
David Barkerbe6cc072016-12-15 15:39:10 +00001141void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1142 uint8_t *dst, ptrdiff_t dst_stride,
1143 const int16_t *filter_x, int x_step_q4,
1144 const int16_t *filter_y, int y_step_q4,
1145 int w, int h, int bd) {
1146 const InterpKernel *const filters_x = get_filter_base(filter_x);
1147 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1148
1149 const InterpKernel *const filters_y = get_filter_base(filter_y);
1150 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1151
1152 highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
1153 x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
1154}
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001155
1156static void highbd_convolve_add_src_horiz_hip(
1157 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1158 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1159 int x_step_q4, int w, int h, int bd) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -07001160 const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001161 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1162 src -= SUBPEL_TAPS / 2 - 1;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001163 for (int y = 0; y < h; ++y) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001164 int x_q4 = x0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001165 for (int x = 0; x < w; ++x) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001166 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1167 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +01001168 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1169 (1 << (bd + FILTER_BITS - 1));
1170 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001171 dst[x] =
1172 (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
1173 0, extraprec_clamp_limit - 1);
1174 x_q4 += x_step_q4;
1175 }
1176 src += src_stride;
1177 dst += dst_stride;
1178 }
1179}
1180
1181static void highbd_convolve_add_src_vert_hip(
1182 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1183 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1184 int y_step_q4, int w, int h, int bd) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001185 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1186 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001187 for (int x = 0; x < w; ++x) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001188 int y_q4 = y0_q4;
Sebastien Alaiwanc76ed282017-11-09 16:59:25 +01001189 for (int y = 0; y < h; ++y) {
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001190 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1191 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Sebastien Alaiwanb093b142017-11-09 17:23:58 +01001192 const int rounding =
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -07001193 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1194 (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
Sebastien Alaiwanb093b142017-11-09 17:23:58 +01001195 const int sum =
1196 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001197 dst[y * dst_stride] = clip_pixel_highbd(
1198 ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
1199 y_q4 += y_step_q4;
1200 }
1201 ++src;
1202 ++dst;
1203 }
1204}
1205
1206static void highbd_convolve_add_src_hip(
1207 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1208 ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
1209 int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
1210 int y_step_q4, int w, int h, int bd) {
1211 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1212 // 2d filtering proceeds in 2 steps:
1213 // (1) Interpolate horizontally into an intermediate buffer, temp.
1214 // (2) Interpolate temp vertically to derive the sub-pixel result.
1215 // Deriving the maximum number of rows in the temp buffer (135):
1216 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1217 // --Largest block size is 64x64 pixels.
1218 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1219 // original frame (in 1/16th pixel units).
1220 // --Must round-up because block may be located at sub-pixel position.
1221 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1222 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1223 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
Sebastien Alaiwan9e9dea02017-11-29 11:53:48 +01001224 const int intermediate_height =
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001225 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1226
1227 assert(w <= MAX_SB_SIZE);
1228 assert(h <= MAX_SB_SIZE);
1229 assert(y_step_q4 <= 32);
1230 assert(x_step_q4 <= 32);
1231
1232 highbd_convolve_add_src_horiz_hip(
1233 src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
1234 x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
1235 highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1236 MAX_SB_SIZE, dst, dst_stride, y_filters,
1237 y0_q4, y_step_q4, w, h, bd);
1238}
1239
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001240void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
1241 ptrdiff_t src_stride, uint8_t *dst,
1242 ptrdiff_t dst_stride,
1243 const int16_t *filter_x, int x_step_q4,
1244 const int16_t *filter_y, int y_step_q4,
1245 int w, int h, int bd) {
1246 const InterpKernel *const filters_x = get_filter_base(filter_x);
1247 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1248
1249 const InterpKernel *const filters_y = get_filter_base(filter_y);
1250 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1251
1252 highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
1253 x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
1254 h, bd);
1255}
David Barkerbe6cc072016-12-15 15:39:10 +00001256#endif // CONFIG_LOOP_RESTORATION