blob: 4dac6aacc8606b9894ddb31e0ff0a1fa62311693 [file] [log] [blame]
Yaowu Xuc27fc142016-08-22 16:08:15 -07001/*
Yaowu Xu9c01aa12016-09-01 14:32:49 -07002 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
Yaowu Xuc27fc142016-08-22 16:08:15 -07003 *
Yaowu Xu9c01aa12016-09-01 14:32:49 -07004 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Yaowu Xuc27fc142016-08-22 16:08:15 -070010 */
11
12#include <assert.h>
13#include <string.h>
14
Yaowu Xuf883b422016-08-30 14:01:10 -070015#include "./aom_config.h"
16#include "./aom_dsp_rtcd.h"
17#include "aom/aom_integer.h"
18#include "aom_dsp/aom_convolve.h"
19#include "aom_dsp/aom_dsp_common.h"
20#include "aom_dsp/aom_filter.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070021#include "aom_ports/mem.h"
22
23static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
24 uint8_t *dst, ptrdiff_t dst_stride,
25 const InterpKernel *x_filters, int x0_q4,
26 int x_step_q4, int w, int h) {
27 int x, y;
28 src -= SUBPEL_TAPS / 2 - 1;
29 for (y = 0; y < h; ++y) {
30 int x_q4 = x0_q4;
31 for (x = 0; x < w; ++x) {
32 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
33 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
34 int k, sum = 0;
35 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
36 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37 x_q4 += x_step_q4;
38 }
39 src += src_stride;
40 dst += dst_stride;
41 }
42}
43
44static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
45 uint8_t *dst, ptrdiff_t dst_stride,
46 const InterpKernel *x_filters, int x0_q4,
47 int x_step_q4, int w, int h) {
48 int x, y;
49 src -= SUBPEL_TAPS / 2 - 1;
50 for (y = 0; y < h; ++y) {
51 int x_q4 = x0_q4;
52 for (x = 0; x < w; ++x) {
53 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
54 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
55 int k, sum = 0;
56 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
57 dst[x] = ROUND_POWER_OF_TWO(
58 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
59 x_q4 += x_step_q4;
60 }
61 src += src_stride;
62 dst += dst_stride;
63 }
64}
65
66static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
67 uint8_t *dst, ptrdiff_t dst_stride,
68 const InterpKernel *y_filters, int y0_q4,
69 int y_step_q4, int w, int h) {
70 int x, y;
71 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
72
73 for (x = 0; x < w; ++x) {
74 int y_q4 = y0_q4;
75 for (y = 0; y < h; ++y) {
76 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
77 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
78 int k, sum = 0;
79 for (k = 0; k < SUBPEL_TAPS; ++k)
80 sum += src_y[k * src_stride] * y_filter[k];
81 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
82 y_q4 += y_step_q4;
83 }
84 ++src;
85 ++dst;
86 }
87}
88
89static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
90 uint8_t *dst, ptrdiff_t dst_stride,
91 const InterpKernel *y_filters, int y0_q4,
92 int y_step_q4, int w, int h) {
93 int x, y;
94 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
95
96 for (x = 0; x < w; ++x) {
97 int y_q4 = y0_q4;
98 for (y = 0; y < h; ++y) {
99 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
100 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
101 int k, sum = 0;
102 for (k = 0; k < SUBPEL_TAPS; ++k)
103 sum += src_y[k * src_stride] * y_filter[k];
104 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
105 dst[y * dst_stride] +
106 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
107 1);
108 y_q4 += y_step_q4;
109 }
110 ++src;
111 ++dst;
112 }
113}
114
115static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
116 ptrdiff_t dst_stride, const InterpKernel *const x_filters,
117 int x0_q4, int x_step_q4,
118 const InterpKernel *const y_filters, int y0_q4,
119 int y_step_q4, int w, int h) {
120 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
121 // 2d filtering proceeds in 2 steps:
122 // (1) Interpolate horizontally into an intermediate buffer, temp.
123 // (2) Interpolate temp vertically to derive the sub-pixel result.
124 // Deriving the maximum number of rows in the temp buffer (135):
125 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
126 // --Largest block size is 64x64 pixels.
127 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
128 // original frame (in 1/16th pixel units).
129 // --Must round-up because block may be located at sub-pixel position.
130 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
131 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
132 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
133 int intermediate_height =
134 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
135
136 assert(w <= MAX_SB_SIZE);
137 assert(h <= MAX_SB_SIZE);
138
139 assert(y_step_q4 <= 32);
140 assert(x_step_q4 <= 32);
141
142 convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
143 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
144 intermediate_height);
145 convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
146 dst_stride, y_filters, y0_q4, y_step_q4, w, h);
147}
148
149static const InterpKernel *get_filter_base(const int16_t *filter) {
150 // NOTE: This assumes that the filter table is 256-byte aligned.
151 // TODO(agrange) Modify to make independent of table alignment.
152 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
153}
154
155static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
156 return (int)((const InterpKernel *)(intptr_t)f - base);
157}
158
Yaowu Xuf883b422016-08-30 14:01:10 -0700159void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700160 uint8_t *dst, ptrdiff_t dst_stride,
161 const int16_t *filter_x, int x_step_q4,
162 const int16_t *filter_y, int y_step_q4, int w,
163 int h) {
164 const InterpKernel *const filters_x = get_filter_base(filter_x);
165 const int x0_q4 = get_filter_offset(filter_x, filters_x);
166
167 (void)filter_y;
168 (void)y_step_q4;
169
170 convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
171 w, h);
172}
173
Yaowu Xuf883b422016-08-30 14:01:10 -0700174void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700175 uint8_t *dst, ptrdiff_t dst_stride,
176 const int16_t *filter_x, int x_step_q4,
177 const int16_t *filter_y, int y_step_q4, int w,
178 int h) {
179 const InterpKernel *const filters_x = get_filter_base(filter_x);
180 const int x0_q4 = get_filter_offset(filter_x, filters_x);
181
182 (void)filter_y;
183 (void)y_step_q4;
184
185 convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
186 x_step_q4, w, h);
187}
188
Yaowu Xuf883b422016-08-30 14:01:10 -0700189void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700190 uint8_t *dst, ptrdiff_t dst_stride,
191 const int16_t *filter_x, int x_step_q4,
192 const int16_t *filter_y, int y_step_q4, int w,
193 int h) {
194 const InterpKernel *const filters_y = get_filter_base(filter_y);
195 const int y0_q4 = get_filter_offset(filter_y, filters_y);
196
197 (void)filter_x;
198 (void)x_step_q4;
199
200 convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
201 w, h);
202}
203
Yaowu Xuf883b422016-08-30 14:01:10 -0700204void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700205 uint8_t *dst, ptrdiff_t dst_stride,
206 const int16_t *filter_x, int x_step_q4,
207 const int16_t *filter_y, int y_step_q4, int w,
208 int h) {
209 const InterpKernel *const filters_y = get_filter_base(filter_y);
210 const int y0_q4 = get_filter_offset(filter_y, filters_y);
211
212 (void)filter_x;
213 (void)x_step_q4;
214
215 convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
216 y_step_q4, w, h);
217}
218
Yaowu Xuf883b422016-08-30 14:01:10 -0700219void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700220 ptrdiff_t dst_stride, const int16_t *filter_x,
221 int x_step_q4, const int16_t *filter_y, int y_step_q4,
222 int w, int h) {
223 const InterpKernel *const filters_x = get_filter_base(filter_x);
224 const int x0_q4 = get_filter_offset(filter_x, filters_x);
225
226 const InterpKernel *const filters_y = get_filter_base(filter_y);
227 const int y0_q4 = get_filter_offset(filter_y, filters_y);
228
229 convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
230 filters_y, y0_q4, y_step_q4, w, h);
231}
232
Yaowu Xuf883b422016-08-30 14:01:10 -0700233void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700234 ptrdiff_t dst_stride, const int16_t *filter_x,
235 int x_step_q4, const int16_t *filter_y, int y_step_q4,
236 int w, int h) {
237 /* Fixed size intermediate buffer places limits on parameters. */
238 DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
239 assert(w <= MAX_SB_SIZE);
240 assert(h <= MAX_SB_SIZE);
241
Yaowu Xuf883b422016-08-30 14:01:10 -0700242 aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700243 filter_y, y_step_q4, w, h);
Yaowu Xuf883b422016-08-30 14:01:10 -0700244 aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700245 h);
246}
247
// Copies a w x h block of bytes from src to dst, one row at a time.
//
// The filter arguments are not used.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  int rows_left = h;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  while (rows_left > 0) {
    memcpy(dst_row, src_row, w);
    src_row += src_stride;
    dst_row += dst_stride;
    --rows_left;
  }
}
265
// Averages a w x h block of src into dst: every dst pixel becomes the rounded
// mean (ROUND_POWER_OF_TWO(dst + src, 1)) of itself and the matching src
// pixel.
//
// The filter arguments are not used.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  int row;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (row = 0; row < h; ++row) {
    int col;

    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }

    src += src_stride;
    dst += dst_stride;
  }
}
284
// Scaled horizontal prediction: forwards every argument unchanged to
// aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}
292
// Scaled vertical prediction: forwards every argument unchanged to
// aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}
300
// Scaled 2-D prediction: forwards every argument unchanged to
// aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}
308
// Scaled horizontal prediction with averaging: forwards every argument
// unchanged to aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}
317
// Scaled vertical prediction with averaging: forwards every argument
// unchanged to aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}
326
// Scaled 2-D prediction with averaging: forwards every argument unchanged to
// aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}
334
David Barkerbe6cc072016-12-15 15:39:10 +0000335#if CONFIG_LOOP_RESTORATION
336static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
337 uint8_t *dst, ptrdiff_t dst_stride,
338 const InterpKernel *x_filters, int x0_q4,
339 int x_step_q4, int w, int h) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700340 int x, y, k;
David Barkerbe6cc072016-12-15 15:39:10 +0000341 src -= SUBPEL_TAPS / 2 - 1;
342 for (y = 0; y < h; ++y) {
343 int x_q4 = x0_q4;
344 for (x = 0; x < w; ++x) {
345 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
346 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700347 int sum = 0;
David Barkerbe6cc072016-12-15 15:39:10 +0000348 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
349 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
350 src_x[SUBPEL_TAPS / 2 - 1]);
351 x_q4 += x_step_q4;
352 }
353 src += src_stride;
354 dst += dst_stride;
355 }
356}
357
358static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
359 uint8_t *dst, ptrdiff_t dst_stride,
360 const InterpKernel *y_filters, int y0_q4,
361 int y_step_q4, int w, int h) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700362 int x, y, k;
David Barkerbe6cc072016-12-15 15:39:10 +0000363 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
364
365 for (x = 0; x < w; ++x) {
366 int y_q4 = y0_q4;
367 for (y = 0; y < h; ++y) {
368 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
369 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700370 int sum = 0;
David Barkerbe6cc072016-12-15 15:39:10 +0000371 for (k = 0; k < SUBPEL_TAPS; ++k)
372 sum += src_y[k * src_stride] * y_filter[k];
373 dst[y * dst_stride] =
374 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
375 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
376 y_q4 += y_step_q4;
377 }
378 ++src;
379 ++dst;
380 }
381}
382
383static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
384 uint8_t *dst, ptrdiff_t dst_stride,
385 const InterpKernel *const x_filters, int x0_q4,
386 int x_step_q4, const InterpKernel *const y_filters,
387 int y0_q4, int y_step_q4, int w, int h) {
388 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
389 int intermediate_height =
390 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
391
392 assert(w <= MAX_SB_SIZE);
393 assert(h <= MAX_SB_SIZE);
394
395 assert(y_step_q4 <= 32);
396 assert(x_step_q4 <= 32);
397
398 convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
399 temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
400 intermediate_height);
401 convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
402 dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
403}
404
405void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
406 uint8_t *dst, ptrdiff_t dst_stride,
407 const int16_t *filter_x, int x_step_q4,
408 const int16_t *filter_y, int y_step_q4,
409 int w, int h) {
410 const InterpKernel *const filters_x = get_filter_base(filter_x);
411 const int x0_q4 = get_filter_offset(filter_x, filters_x);
412
413 (void)filter_y;
414 (void)y_step_q4;
415
416 convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
417 x_step_q4, w, h);
418}
419
420void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
421 uint8_t *dst, ptrdiff_t dst_stride,
422 const int16_t *filter_x, int x_step_q4,
423 const int16_t *filter_y, int y_step_q4, int w,
424 int h) {
425 const InterpKernel *const filters_y = get_filter_base(filter_y);
426 const int y0_q4 = get_filter_offset(filter_y, filters_y);
427
428 (void)filter_x;
429 (void)x_step_q4;
430
431 convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
432 y_step_q4, w, h);
433}
434
435void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
436 uint8_t *dst, ptrdiff_t dst_stride,
437 const int16_t *filter_x, int x_step_q4,
438 const int16_t *filter_y, int y_step_q4, int w,
439 int h) {
440 const InterpKernel *const filters_x = get_filter_base(filter_x);
441 const int x0_q4 = get_filter_offset(filter_x, filters_x);
442
443 const InterpKernel *const filters_y = get_filter_base(filter_y);
444 const int y0_q4 = get_filter_offset(filter_y, filters_y);
445
446 convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
447 x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
448}
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700449
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700450static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
451 uint16_t *dst, ptrdiff_t dst_stride,
452 const InterpKernel *x_filters, int x0_q4,
453 int x_step_q4, int w, int h) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700454 const int bd = 8;
455 int x, y, k;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700456 src -= SUBPEL_TAPS / 2 - 1;
457 for (y = 0; y < h; ++y) {
458 int x_q4 = x0_q4;
459 for (x = 0; x < w; ++x) {
460 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
461 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700462 int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
463 (1 << (bd + FILTER_BITS - 1));
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700464 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
465 dst[x] =
466 (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700467 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700468 x_q4 += x_step_q4;
469 }
470 src += src_stride;
471 dst += dst_stride;
472 }
473}
474
475static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
476 uint8_t *dst, ptrdiff_t dst_stride,
477 const InterpKernel *y_filters, int y0_q4,
478 int y_step_q4, int w, int h) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700479 const int bd = 8;
480 int x, y, k;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700481 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
482
483 for (x = 0; x < w; ++x) {
484 int y_q4 = y0_q4;
485 for (y = 0; y < h; ++y) {
486 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
487 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700488 int sum =
489 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
490 (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700491 for (k = 0; k < SUBPEL_TAPS; ++k)
492 sum += src_y[k * src_stride] * y_filter[k];
493 dst[y * dst_stride] =
494 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
495 y_q4 += y_step_q4;
496 }
497 ++src;
498 ++dst;
499 }
500}
501
502static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
503 uint8_t *dst, ptrdiff_t dst_stride,
504 const InterpKernel *const x_filters, int x0_q4,
505 int x_step_q4,
506 const InterpKernel *const y_filters, int y0_q4,
507 int y_step_q4, int w, int h) {
508 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
509 int intermediate_height =
510 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
511
512 assert(w <= MAX_SB_SIZE);
513 assert(h <= MAX_SB_SIZE);
514
515 assert(y_step_q4 <= 32);
516 assert(x_step_q4 <= 32);
517
518 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
519 src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
520 x_step_q4, w, intermediate_height);
521 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
522 MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
523 y_step_q4, w, h);
524}
525
526void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
527 uint16_t *dst, ptrdiff_t dst_stride,
528 const int16_t *filter_x, int x_step_q4,
529 const int16_t *filter_y, int y_step_q4,
530 int w, int h) {
531 const InterpKernel *const filters_x = get_filter_base(filter_x);
532 const int x0_q4 = get_filter_offset(filter_x, filters_x);
533
534 (void)filter_y;
535 (void)y_step_q4;
536
537 convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
538 x_step_q4, w, h);
539}
540
541void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
542 uint8_t *dst, ptrdiff_t dst_stride,
543 const int16_t *filter_x, int x_step_q4,
544 const int16_t *filter_y, int y_step_q4,
545 int w, int h) {
546 const InterpKernel *const filters_y = get_filter_base(filter_y);
547 const int y0_q4 = get_filter_offset(filter_y, filters_y);
548
549 (void)filter_x;
550 (void)x_step_q4;
551
552 convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
553 y_step_q4, w, h);
554}
555
556void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
557 uint8_t *dst, ptrdiff_t dst_stride,
558 const int16_t *filter_x, int x_step_q4,
559 const int16_t *filter_y, int y_step_q4, int w,
560 int h) {
561 const InterpKernel *const filters_x = get_filter_base(filter_x);
562 const int x0_q4 = get_filter_offset(filter_x, filters_x);
563
564 const InterpKernel *const filters_y = get_filter_base(filter_y);
565 const int y0_q4 = get_filter_offset(filter_y, filters_y);
566
567 convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
568 x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
569}
David Barkerbe6cc072016-12-15 15:39:10 +0000570#endif // CONFIG_LOOP_RESTORATION
571
Sebastien Alaiwan71e87842017-04-12 16:03:28 +0200572#if CONFIG_HIGHBITDEPTH
Yaowu Xuc27fc142016-08-22 16:08:15 -0700573static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
574 uint8_t *dst8, ptrdiff_t dst_stride,
575 const InterpKernel *x_filters, int x0_q4,
576 int x_step_q4, int w, int h, int bd) {
577 int x, y;
578 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
579 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
580 src -= SUBPEL_TAPS / 2 - 1;
581 for (y = 0; y < h; ++y) {
582 int x_q4 = x0_q4;
583 for (x = 0; x < w; ++x) {
584 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
585 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
586 int k, sum = 0;
587 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
588 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
589 x_q4 += x_step_q4;
590 }
591 src += src_stride;
592 dst += dst_stride;
593 }
594}
595
596static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
597 uint8_t *dst8, ptrdiff_t dst_stride,
598 const InterpKernel *x_filters, int x0_q4,
599 int x_step_q4, int w, int h, int bd) {
600 int x, y;
601 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
602 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
603 src -= SUBPEL_TAPS / 2 - 1;
604 for (y = 0; y < h; ++y) {
605 int x_q4 = x0_q4;
606 for (x = 0; x < w; ++x) {
607 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
608 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
609 int k, sum = 0;
610 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
611 dst[x] = ROUND_POWER_OF_TWO(
612 dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
613 1);
614 x_q4 += x_step_q4;
615 }
616 src += src_stride;
617 dst += dst_stride;
618 }
619}
620
621static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
622 uint8_t *dst8, ptrdiff_t dst_stride,
623 const InterpKernel *y_filters, int y0_q4,
624 int y_step_q4, int w, int h, int bd) {
625 int x, y;
626 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
627 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
628 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
629 for (x = 0; x < w; ++x) {
630 int y_q4 = y0_q4;
631 for (y = 0; y < h; ++y) {
632 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
633 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
634 int k, sum = 0;
635 for (k = 0; k < SUBPEL_TAPS; ++k)
636 sum += src_y[k * src_stride] * y_filter[k];
637 dst[y * dst_stride] =
638 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
639 y_q4 += y_step_q4;
640 }
641 ++src;
642 ++dst;
643 }
644}
645
646static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
647 uint8_t *dst8, ptrdiff_t dst_stride,
648 const InterpKernel *y_filters, int y0_q4,
649 int y_step_q4, int w, int h, int bd) {
650 int x, y;
651 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
652 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
653 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
654 for (x = 0; x < w; ++x) {
655 int y_q4 = y0_q4;
656 for (y = 0; y < h; ++y) {
657 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
658 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
659 int k, sum = 0;
660 for (k = 0; k < SUBPEL_TAPS; ++k)
661 sum += src_y[k * src_stride] * y_filter[k];
662 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
663 dst[y * dst_stride] +
664 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
665 1);
666 y_q4 += y_step_q4;
667 }
668 ++src;
669 ++dst;
670 }
671}
672
673static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
674 uint8_t *dst, ptrdiff_t dst_stride,
675 const InterpKernel *const x_filters, int x0_q4,
676 int x_step_q4, const InterpKernel *const y_filters,
677 int y0_q4, int y_step_q4, int w, int h, int bd) {
678 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
679 // 2d filtering proceeds in 2 steps:
680 // (1) Interpolate horizontally into an intermediate buffer, temp.
681 // (2) Interpolate temp vertically to derive the sub-pixel result.
682 // Deriving the maximum number of rows in the temp buffer (135):
683 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
684 // --Largest block size is 64x64 pixels.
685 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
686 // original frame (in 1/16th pixel units).
687 // --Must round-up because block may be located at sub-pixel position.
688 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
689 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
690 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
691 int intermediate_height =
692 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
693
694 assert(w <= MAX_SB_SIZE);
695 assert(h <= MAX_SB_SIZE);
696 assert(y_step_q4 <= 32);
697 assert(x_step_q4 <= 32);
698
699 highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
700 CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
701 x_step_q4, w, intermediate_height, bd);
702 highbd_convolve_vert(
703 CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
704 MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
705}
706
Yaowu Xuf883b422016-08-30 14:01:10 -0700707void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700708 uint8_t *dst, ptrdiff_t dst_stride,
709 const int16_t *filter_x, int x_step_q4,
710 const int16_t *filter_y, int y_step_q4, int w,
711 int h, int bd) {
712 const InterpKernel *const filters_x = get_filter_base(filter_x);
713 const int x0_q4 = get_filter_offset(filter_x, filters_x);
714 (void)filter_y;
715 (void)y_step_q4;
716
717 highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
718 x_step_q4, w, h, bd);
719}
720
Yaowu Xuf883b422016-08-30 14:01:10 -0700721void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700722 uint8_t *dst, ptrdiff_t dst_stride,
723 const int16_t *filter_x, int x_step_q4,
724 const int16_t *filter_y, int y_step_q4,
725 int w, int h, int bd) {
726 const InterpKernel *const filters_x = get_filter_base(filter_x);
727 const int x0_q4 = get_filter_offset(filter_x, filters_x);
728 (void)filter_y;
729 (void)y_step_q4;
730
731 highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
732 x_step_q4, w, h, bd);
733}
734
Yaowu Xuf883b422016-08-30 14:01:10 -0700735void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700736 uint8_t *dst, ptrdiff_t dst_stride,
737 const int16_t *filter_x, int x_step_q4,
738 const int16_t *filter_y, int y_step_q4, int w,
739 int h, int bd) {
740 const InterpKernel *const filters_y = get_filter_base(filter_y);
741 const int y0_q4 = get_filter_offset(filter_y, filters_y);
742 (void)filter_x;
743 (void)x_step_q4;
744
745 highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
746 y_step_q4, w, h, bd);
747}
748
Yaowu Xuf883b422016-08-30 14:01:10 -0700749void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700750 uint8_t *dst, ptrdiff_t dst_stride,
751 const int16_t *filter_x, int x_step_q4,
752 const int16_t *filter_y, int y_step_q4,
753 int w, int h, int bd) {
754 const InterpKernel *const filters_y = get_filter_base(filter_y);
755 const int y0_q4 = get_filter_offset(filter_y, filters_y);
756 (void)filter_x;
757 (void)x_step_q4;
758
759 highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
760 y_step_q4, w, h, bd);
761}
762
Yaowu Xuf883b422016-08-30 14:01:10 -0700763void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700764 uint8_t *dst, ptrdiff_t dst_stride,
765 const int16_t *filter_x, int x_step_q4,
766 const int16_t *filter_y, int y_step_q4, int w,
767 int h, int bd) {
768 const InterpKernel *const filters_x = get_filter_base(filter_x);
769 const int x0_q4 = get_filter_offset(filter_x, filters_x);
770
771 const InterpKernel *const filters_y = get_filter_base(filter_y);
772 const int y0_q4 = get_filter_offset(filter_y, filters_y);
773
774 highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
775 filters_y, y0_q4, y_step_q4, w, h, bd);
776}
777
Yaowu Xuf883b422016-08-30 14:01:10 -0700778void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700779 uint8_t *dst, ptrdiff_t dst_stride,
780 const int16_t *filter_x, int x_step_q4,
781 const int16_t *filter_y, int y_step_q4, int w,
782 int h, int bd) {
783 // Fixed size intermediate buffer places limits on parameters.
784 DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
785 assert(w <= MAX_SB_SIZE);
786 assert(h <= MAX_SB_SIZE);
787
Yaowu Xuf883b422016-08-30 14:01:10 -0700788 aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700789 filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
Yaowu Xuf883b422016-08-30 14:01:10 -0700790 aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
Yaowu Xuc27fc142016-08-22 16:08:15 -0700791 dst_stride, NULL, 0, NULL, 0, w, h, bd);
792}
793
// Copies a w x h block of 16-bit samples from src8 to dst8, one row at a
// time. The filter arguments, their strides and bd are not used.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);
  int rows_left = h;

  (void)bd;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  while (rows_left > 0) {
    // Strides are in samples; the copy length is in bytes.
    memcpy(dst_row, src_row, w * sizeof(uint16_t));
    src_row += src_stride;
    dst_row += dst_stride;
    --rows_left;
  }
}
814
// Replaces every sample of the w x h block in dst8 with the rounded average
// of itself and the co-located sample in src8. The filter arguments, their
// strides and bd are not used.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);
  int row;

  (void)bd;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (row = 0; row < h; ++row) {
    int col;
    for (col = 0; col < w; ++col)
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
David Barkerbe6cc072016-12-15 15:39:10 +0000837
838#if CONFIG_LOOP_RESTORATION
839static void highbd_convolve_add_src_horiz(const uint8_t *src8,
840 ptrdiff_t src_stride, uint8_t *dst8,
841 ptrdiff_t dst_stride,
842 const InterpKernel *x_filters,
843 int x0_q4, int x_step_q4, int w,
844 int h, int bd) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700845 int x, y, k;
David Barkerbe6cc072016-12-15 15:39:10 +0000846 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
847 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
848 src -= SUBPEL_TAPS / 2 - 1;
849 for (y = 0; y < h; ++y) {
850 int x_q4 = x0_q4;
851 for (x = 0; x < w; ++x) {
852 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
853 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700854 int sum = 0;
David Barkerbe6cc072016-12-15 15:39:10 +0000855 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
856 dst[x] = clip_pixel_highbd(
857 ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
858 bd);
859 x_q4 += x_step_q4;
860 }
861 src += src_stride;
862 dst += dst_stride;
863 }
864}
865
866static void highbd_convolve_add_src_vert(const uint8_t *src8,
867 ptrdiff_t src_stride, uint8_t *dst8,
868 ptrdiff_t dst_stride,
869 const InterpKernel *y_filters,
870 int y0_q4, int y_step_q4, int w, int h,
871 int bd) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700872 int x, y, k;
David Barkerbe6cc072016-12-15 15:39:10 +0000873 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
874 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
875 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
876 for (x = 0; x < w; ++x) {
877 int y_q4 = y0_q4;
878 for (y = 0; y < h; ++y) {
879 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
880 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700881 int sum = 0;
David Barkerbe6cc072016-12-15 15:39:10 +0000882 for (k = 0; k < SUBPEL_TAPS; ++k)
883 sum += src_y[k * src_stride] * y_filter[k];
884 dst[y * dst_stride] =
885 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
886 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
887 bd);
888 y_q4 += y_step_q4;
889 }
890 ++src;
891 ++dst;
892 }
893}
894
895static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
896 uint8_t *dst, ptrdiff_t dst_stride,
897 const InterpKernel *const x_filters,
898 int x0_q4, int x_step_q4,
899 const InterpKernel *const y_filters,
900 int y0_q4, int y_step_q4, int w, int h,
901 int bd) {
902 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
903 // 2d filtering proceeds in 2 steps:
904 // (1) Interpolate horizontally into an intermediate buffer, temp.
905 // (2) Interpolate temp vertically to derive the sub-pixel result.
906 // Deriving the maximum number of rows in the temp buffer (135):
907 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
908 // --Largest block size is 64x64 pixels.
909 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
910 // original frame (in 1/16th pixel units).
911 // --Must round-up because block may be located at sub-pixel position.
912 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
913 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
914 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
915 int intermediate_height =
916 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
917
918 assert(w <= MAX_SB_SIZE);
919 assert(h <= MAX_SB_SIZE);
920 assert(y_step_q4 <= 32);
921 assert(x_step_q4 <= 32);
922
923 highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
924 src_stride, CONVERT_TO_BYTEPTR(temp),
925 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
926 intermediate_height, bd);
927 highbd_convolve_add_src_vert(
928 CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
929 MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
930}
931
932void aom_highbd_convolve8_add_src_horiz_c(
933 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
934 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
935 const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
936 const InterpKernel *const filters_x = get_filter_base(filter_x);
937 const int x0_q4 = get_filter_offset(filter_x, filters_x);
938 (void)filter_y;
939 (void)y_step_q4;
940
941 highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
942 x0_q4, x_step_q4, w, h, bd);
943}
944
945void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
946 ptrdiff_t src_stride, uint8_t *dst,
947 ptrdiff_t dst_stride,
948 const int16_t *filter_x, int x_step_q4,
949 const int16_t *filter_y, int y_step_q4,
950 int w, int h, int bd) {
951 const InterpKernel *const filters_y = get_filter_base(filter_y);
952 const int y0_q4 = get_filter_offset(filter_y, filters_y);
953 (void)filter_x;
954 (void)x_step_q4;
955
956 highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
957 y0_q4, y_step_q4, w, h, bd);
958}
959
960void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
961 uint8_t *dst, ptrdiff_t dst_stride,
962 const int16_t *filter_x, int x_step_q4,
963 const int16_t *filter_y, int y_step_q4,
964 int w, int h, int bd) {
965 const InterpKernel *const filters_x = get_filter_base(filter_x);
966 const int x0_q4 = get_filter_offset(filter_x, filters_x);
967
968 const InterpKernel *const filters_y = get_filter_base(filter_y);
969 const int y0_q4 = get_filter_offset(filter_y, filters_y);
970
971 highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
972 x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
973}
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700974
975static void highbd_convolve_add_src_horiz_hip(
976 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
977 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
978 int x_step_q4, int w, int h, int bd) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700979 const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
980 int x, y, k;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700981 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
982 src -= SUBPEL_TAPS / 2 - 1;
983 for (y = 0; y < h; ++y) {
984 int x_q4 = x0_q4;
985 for (x = 0; x < w; ++x) {
986 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
987 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -0700988 int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
989 (1 << (bd + FILTER_BITS - 1));
Debargha Mukherjee28d15c72017-05-12 10:44:03 -0700990 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
991 dst[x] =
992 (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
993 0, extraprec_clamp_limit - 1);
994 x_q4 += x_step_q4;
995 }
996 src += src_stride;
997 dst += dst_stride;
998 }
999}
1000
1001static void highbd_convolve_add_src_vert_hip(
1002 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1003 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1004 int y_step_q4, int w, int h, int bd) {
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -07001005 int x, y, k;
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001006 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1007 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1008 for (x = 0; x < w; ++x) {
1009 int y_q4 = y0_q4;
1010 for (y = 0; y < h; ++y) {
1011 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1012 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
Debargha Mukherjee11cf46f2017-05-25 12:07:47 -07001013 int sum =
1014 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1015 (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
Debargha Mukherjee28d15c72017-05-12 10:44:03 -07001016 for (k = 0; k < SUBPEL_TAPS; ++k)
1017 sum += src_y[k * src_stride] * y_filter[k];
1018 dst[y * dst_stride] = clip_pixel_highbd(
1019 ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
1020 y_q4 += y_step_q4;
1021 }
1022 ++src;
1023 ++dst;
1024 }
1025}
1026
1027static void highbd_convolve_add_src_hip(
1028 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1029 ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
1030 int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
1031 int y_step_q4, int w, int h, int bd) {
1032 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1033 // 2d filtering proceeds in 2 steps:
1034 // (1) Interpolate horizontally into an intermediate buffer, temp.
1035 // (2) Interpolate temp vertically to derive the sub-pixel result.
1036 // Deriving the maximum number of rows in the temp buffer (135):
1037 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1038 // --Largest block size is 64x64 pixels.
1039 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1040 // original frame (in 1/16th pixel units).
1041 // --Must round-up because block may be located at sub-pixel position.
1042 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1043 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1044 uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
1045 int intermediate_height =
1046 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1047
1048 assert(w <= MAX_SB_SIZE);
1049 assert(h <= MAX_SB_SIZE);
1050 assert(y_step_q4 <= 32);
1051 assert(x_step_q4 <= 32);
1052
1053 highbd_convolve_add_src_horiz_hip(
1054 src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
1055 x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
1056 highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1057 MAX_SB_SIZE, dst, dst_stride, y_filters,
1058 y0_q4, y_step_q4, w, h, bd);
1059}
1060
1061void aom_highbd_convolve8_add_src_horiz_hip_c(
1062 const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
1063 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1064 const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1065 const InterpKernel *const filters_x = get_filter_base(filter_x);
1066 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1067 (void)filter_y;
1068 (void)y_step_q4;
1069
1070 highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x,
1071 x0_q4, x_step_q4, w, h, bd);
1072}
1073
1074void aom_highbd_convolve8_add_src_vert_hip_c(
1075 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
1076 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1077 const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1078 const InterpKernel *const filters_y = get_filter_base(filter_y);
1079 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1080 (void)filter_x;
1081 (void)x_step_q4;
1082
1083 highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y,
1084 y0_q4, y_step_q4, w, h, bd);
1085}
1086
1087void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
1088 ptrdiff_t src_stride, uint8_t *dst,
1089 ptrdiff_t dst_stride,
1090 const int16_t *filter_x, int x_step_q4,
1091 const int16_t *filter_y, int y_step_q4,
1092 int w, int h, int bd) {
1093 const InterpKernel *const filters_x = get_filter_base(filter_x);
1094 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1095
1096 const InterpKernel *const filters_y = get_filter_base(filter_y);
1097 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1098
1099 highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
1100 x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
1101 h, bd);
1102}
1103
David Barkerbe6cc072016-12-15 15:39:10 +00001104#endif // CONFIG_LOOP_RESTORATION
Sebastien Alaiwan71e87842017-04-12 16:03:28 +02001105#endif // CONFIG_HIGHBITDEPTH