blob: e2e8392194f5075f08ea4a12b812b8dd896ffce5 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include <stdlib.h>
13
Yaowu Xuf883b422016-08-30 14:01:10 -070014#include "./aom_config.h"
15#include "./aom_dsp_rtcd.h"
16#include "aom_dsp/aom_dsp_common.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070017#include "aom_ports/mem.h"
18
19static INLINE int8_t signed_char_clamp(int t) {
20 return (int8_t)clamp(t, -128, 127);
21}
22
// Compile-time choice of the wide-edge filter used by the mb_lpf_*_edge_w()
// helpers in this file: the 11-tap filter12() when PARALLEL_DEBLOCKING_11_TAP
// is non-zero, otherwise the 9-tap filter10() when PARALLEL_DEBLOCKING_9_TAP
// is non-zero, otherwise the 15-tap filter16().
#define PARALLEL_DEBLOCKING_11_TAP 0
#define PARALLEL_DEBLOCKING_9_TAP 0
25
Sebastien Alaiwan71e87842017-04-12 16:03:28 +020026#if CONFIG_HIGHBITDEPTH
Yaowu Xuc27fc142016-08-22 16:08:15 -070027static INLINE int16_t signed_char_clamp_high(int t, int bd) {
28 switch (bd) {
29 case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
30 case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
31 case 8:
32 default: return (int16_t)clamp(t, -128, 128 - 1);
33 }
34}
35#endif
Ryan Lei392d0ff2017-02-09 13:05:42 -080036#if CONFIG_PARALLEL_DEBLOCKING
// Should any filter be applied at all? Returns -1 (all bits set) when both
// inner sample steps |p1 - p0| and |q1 - q0| are within limit and the
// cross-edge measure |p0 - q0| * 2 + |p1 - q1| / 2 is within blimit;
// returns 0 otherwise.
static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
                                  uint8_t p0, uint8_t q0, uint8_t q1) {
  const int step_too_big = abs(p1 - p0) > limit || abs(q1 - q0) > limit;
  const int edge_too_big = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  return (step_too_big || edge_too_big) ? 0 : -1;
}
46#endif // CONFIG_PARALLEL_DEBLOCKING
// Should any filter be applied at all? Returns -1 (all bits set) when every
// adjacent-sample step on both sides of the edge is within limit and the
// cross-edge measure |p0 - q0| * 2 + |p1 - q1| / 2 is within blimit;
// returns 0 otherwise.
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                 uint8_t q1, uint8_t q2, uint8_t q3) {
  const int p_step_too_big =
      abs(p3 - p2) > limit || abs(p2 - p1) > limit || abs(p1 - p0) > limit;
  const int q_step_too_big =
      abs(q1 - q0) > limit || abs(q2 - q1) > limit || abs(q3 - q2) > limit;
  const int edge_too_big = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  return (p_step_too_big || q_step_too_big || edge_too_big) ? 0 : -1;
}
60
// Flatness test over four samples per side. Returns -1 (all bits set) when
// p1..p3 all lie within thresh of p0 and q1..q3 all lie within thresh of q0;
// returns 0 otherwise.
static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
  if (abs(p1 - p0) > thresh || abs(p2 - p0) > thresh ||
      abs(p3 - p0) > thresh) {
    return 0;
  }
  if (abs(q1 - q0) > thresh || abs(q2 - q0) > thresh ||
      abs(q3 - q0) > thresh) {
    return 0;
  }
  return -1;
}
73
Ryan Leidd6fa062017-04-13 12:05:33 -070074#if PARALLEL_DEBLOCKING_9_TAP
// Outer flatness test for the 9-tap path. Returns -1 (all bits set) when p4
// lies within thresh of p0 and q4 lies within thresh of q0; 0 otherwise.
static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0,
                                uint8_t q0, uint8_t q4) {
  const int p_dev = abs(p4 - p0);
  const int q_dev = abs(q4 - q0);
  return (p_dev > thresh || q_dev > thresh) ? 0 : -1;
}
82#endif
83
84#if PARALLEL_DEBLOCKING_11_TAP
// Outer flatness test for the 11-tap path. Returns -1 (all bits set) when p4
// and p5 lie within thresh of p0 and q4 and q5 lie within thresh of q0;
// 0 otherwise.
static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4,
                                uint8_t p0, uint8_t q0, uint8_t q4,
                                uint8_t q5) {
  const int p_not_flat = abs(p4 - p0) > thresh || abs(p5 - p0) > thresh;
  const int q_not_flat = abs(q4 - q0) > thresh || abs(q5 - q0) > thresh;
  return (p_not_flat || q_not_flat) ? 0 : -1;
}
95#endif
96
Yaowu Xuc27fc142016-08-22 16:08:15 -070097static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
98 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
99 uint8_t q1, uint8_t q2, uint8_t q3,
100 uint8_t q4) {
101 int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
102 mask |= (abs(p4 - p0) > thresh) * -1;
103 mask |= (abs(q4 - q0) > thresh) * -1;
104 return ~mask;
105}
106
// Is there high edge variance next to the edge? Returns -1 (all bits set)
// when |p1 - p0| or |q1 - q0| exceeds thresh, and 0 otherwise.
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
  const int p_variance = abs(p1 - p0);
  const int q_variance = abs(q1 - q0);
  return (p_variance > thresh || q_variance > thresh) ? -1 : 0;
}
115
116static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
117 uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
118 int8_t filter1, filter2;
119
120 const int8_t ps1 = (int8_t)*op1 ^ 0x80;
121 const int8_t ps0 = (int8_t)*op0 ^ 0x80;
122 const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
123 const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
124 const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
125
126 // add outer taps if we have high edge variance
127 int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
128
129 // inner taps
130 filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
131
132 // save bottom 3 bits so that we round one side +4 and the other +3
133 // if it equals 4 we'll set to adjust by -1 to account for the fact
134 // we'd round 3 the other way
135 filter1 = signed_char_clamp(filter + 4) >> 3;
136 filter2 = signed_char_clamp(filter + 3) >> 3;
137
138 *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
139 *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
140
141 // outer tap adjustments
142 filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
143
144 *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
145 *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
146}
147
// Runs the narrow filter (filter4) across a horizontal edge for 8 consecutive
// positions starting at s. Samples on either side of the edge are p elements
// apart.
void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                            const uint8_t *blimit, const uint8_t *limit,
                            const uint8_t *thresh) {
  int col;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (col = 0; col < 8; ++col, ++s) {
#if !CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask =
        filter_mask(*limit, *blimit, s[-4 * p], s[-3 * p], s[-2 * p], s[-p],
                    s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
#else   // CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask =
        filter_mask2(*limit, *blimit, s[-2 * p], s[-p], s[0 * p], s[1 * p]);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
  }
}
170
// Filters two adjacent 8-position horizontal edge segments: the first at s
// with parameter set 0, the second at s + 8 with parameter set 1.
void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_segment = s + 8;

  aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_4_c(second_segment, p, blimit1, limit1, thresh1);
}
178
// Runs the narrow filter (filter4) across a vertical edge for 8 positions
// starting at s; consecutive positions are pitch elements apart and the
// samples around the edge are adjacent in memory.
void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int row;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (row = 0; row < 8; ++row, s += pitch) {
#if !CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask = filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
                                    s[-1], s[0], s[1], s[2], s[3]);
#else   // CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask =
        filter_mask2(*limit, *blimit, s[-2], s[-1], s[0], s[1]);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
  }
}
200
// Filters two stacked 8-position vertical edge segments: the first at s with
// parameter set 0, the second 8 * pitch elements further on with parameter
// set 1.
void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second_segment = s + 8 * pitch;

  aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
  aom_lpf_vertical_4_c(second_segment, pitch, blimit1, limit1, thresh1);
}
208
209static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
210 uint8_t *op3, uint8_t *op2, uint8_t *op1,
211 uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
212 uint8_t *oq2, uint8_t *oq3) {
213 if (flat && mask) {
214 const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
215 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
216
217 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
218 *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
219 *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
220 *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
221 *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
222 *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
223 *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
224 } else {
225 filter4(mask, thresh, op1, op0, oq0, oq1);
226 }
227}
228
// Runs filter8 across a horizontal edge for 8 consecutive positions starting
// at s. Samples on either side of the edge are p elements apart; the flatness
// test uses a fixed threshold of 1.
void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int col;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (col = 0; col < 8; ++col, ++s) {
    const uint8_t p3 = s[-4 * p];
    const uint8_t p2 = s[-3 * p];
    const uint8_t p1 = s[-2 * p];
    const uint8_t p0 = s[-p];
    const uint8_t q0 = s[0 * p];
    const uint8_t q1 = s[1 * p];
    const uint8_t q2 = s[2 * p];
    const uint8_t q3 = s[3 * p];

    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
            s + 1 * p, s + 2 * p, s + 3 * p);
  }
}
247
// Filters two adjacent 8-position horizontal edge segments with filter8: the
// first at s with parameter set 0, the second at s + 8 with parameter set 1.
void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_segment = s + 8;

  aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_8_c(second_segment, p, blimit1, limit1, thresh1);
}
255
// Runs filter8 across a vertical edge for 8 positions starting at s;
// consecutive positions are pitch elements apart and the samples around the
// edge are adjacent in memory. The flatness test uses a fixed threshold of 1.
void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int row;

  for (row = 0; row < 8; ++row, s += pitch) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];

    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
            s + 3);
  }
}
271
// Filters two stacked 8-position vertical edge segments with filter8: the
// first at s with parameter set 0, the second 8 * pitch elements further on
// with parameter set 1.
void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second_segment = s + 8 * pitch;

  aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
  aom_lpf_vertical_8_c(second_segment, pitch, blimit1, limit1, thresh1);
}
279
Ryan Leidd6fa062017-04-13 12:05:33 -0700280#if PARALLEL_DEBLOCKING_11_TAP
281static INLINE void filter12(int8_t mask, uint8_t thresh, uint8_t flat,
282 uint8_t flat2, uint8_t *op5, uint8_t *op4,
283 uint8_t *op3, uint8_t *op2, uint8_t *op1,
284 uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
285 uint8_t *oq2, uint8_t *oq3, uint8_t *oq4,
286 uint8_t *oq5) {
287 if (flat2 && flat && mask) {
288 const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1,
289 p0 = *op0;
290 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
291 q5 = *oq5;
292
293 // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
294 *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12;
295 *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12;
296 *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12;
297 *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12;
298 *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12;
299 *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12;
300 *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12;
301 *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12;
302 *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12;
303 *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12;
304 } else {
305 filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
306 }
307}
308#endif
309
310#if PARALLEL_DEBLOCKING_9_TAP
311static INLINE void filter10(int8_t mask, uint8_t thresh, uint8_t flat,
312 uint8_t flat2, uint8_t *op4, uint8_t *op3,
313 uint8_t *op2, uint8_t *op1, uint8_t *op0,
314 uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
315 uint8_t *oq3, uint8_t *oq4) {
316 if (flat2 && flat && mask) {
317 const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
318 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4;
319
320 // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1]
321 *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10;
322 *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10;
323 *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10;
324 *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10;
325 *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10;
326 *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10;
327 *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10;
328 *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10;
329 } else {
330 filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
331 }
332}
333#endif
334
Yaowu Xuc27fc142016-08-22 16:08:15 -0700335static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
336 uint8_t flat2, uint8_t *op7, uint8_t *op6,
337 uint8_t *op5, uint8_t *op4, uint8_t *op3,
338 uint8_t *op2, uint8_t *op1, uint8_t *op0,
339 uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
340 uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
341 uint8_t *oq6, uint8_t *oq7) {
342 if (flat2 && flat && mask) {
343 const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
344 p2 = *op2, p1 = *op1, p0 = *op0;
345
346 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
347 q5 = *oq5, q6 = *oq6, q7 = *oq7;
348
349 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
350 *op6 = ROUND_POWER_OF_TWO(
351 p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
352 *op5 = ROUND_POWER_OF_TWO(
353 p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
354 *op4 = ROUND_POWER_OF_TWO(
355 p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
356 *op3 = ROUND_POWER_OF_TWO(
357 p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
358 *op2 = ROUND_POWER_OF_TWO(
359 p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
360 4);
361 *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
362 q0 + q1 + q2 + q3 + q4 + q5,
363 4);
364 *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
365 q1 + q2 + q3 + q4 + q5 + q6,
366 4);
367 *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
368 q2 + q3 + q4 + q5 + q6 + q7,
369 4);
370 *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
371 q3 + q4 + q5 + q6 + q7 * 2,
372 4);
373 *oq2 = ROUND_POWER_OF_TWO(
374 p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
375 4);
376 *oq3 = ROUND_POWER_OF_TWO(
377 p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
378 *oq4 = ROUND_POWER_OF_TWO(
379 p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
380 *oq5 = ROUND_POWER_OF_TWO(
381 p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
382 *oq6 = ROUND_POWER_OF_TWO(
383 p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
384 } else {
385 filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
386 }
387}
388
// Wide filter across a horizontal edge for 8 * count consecutive positions
// starting at s. Samples on either side of the edge are p elements apart.
// The PARALLEL_DEBLOCKING_11_TAP / PARALLEL_DEBLOCKING_9_TAP switches select
// filter12(), filter10() or (default) filter16() as the widest filter. All
// flatness tests use a fixed threshold of 1.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  const int num_positions = 8 * count;
  int col;

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (col = 0; col < num_positions; ++col, ++s) {
    const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
                  p4 = s[-5 * p];
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const uint8_t q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

#if PARALLEL_DEBLOCKING_11_TAP
    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);

    filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
             s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p,
             s + 3 * p, s + 4 * p, s + 5 * p);
#elif PARALLEL_DEBLOCKING_9_TAP
    const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);

    filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p,
             s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p,
             s + 4 * p);
#else
    // flat_mask5() is reused for the outer samples: p7..p4 are compared
    // against p0 and q4..q7 against q0.
    const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);

    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
             s + 7 * p);
#endif
  }
}
431
// Wide horizontal-edge filter over a single group of 8 positions.
void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
  const int groups_of_8 = 1;

  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups_of_8);
}
436
// Wide horizontal-edge filter over two groups of 8 positions (16 in total).
void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
  const int groups_of_8 = 2;

  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups_of_8);
}
441
// Wide filter across a vertical edge for count positions starting at s;
// consecutive positions are p elements apart and the samples around the edge
// are adjacent in memory. The PARALLEL_DEBLOCKING_11_TAP /
// PARALLEL_DEBLOCKING_9_TAP switches select filter12(), filter10() or
// (default) filter16() as the widest filter. All flatness tests use a fixed
// threshold of 1.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int count) {
  int row;

  for (row = 0; row < count; ++row, s += p) {
    const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5];
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const uint8_t q4 = s[4], q5 = s[5], q6 = s[6], q7 = s[7];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

#if PARALLEL_DEBLOCKING_11_TAP
    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);

    filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
             s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5);
#elif PARALLEL_DEBLOCKING_9_TAP
    const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);

    filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s,
             s + 1, s + 2, s + 3, s + 4);
#else
    // flat_mask5() is reused for the outer samples: p7..p4 are compared
    // against p0 and q4..q7 against q0.
    const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);

    filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
             s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
             s + 7);
#endif
  }
}
478
// Wide vertical-edge filter over 8 positions.
void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  const int num_positions = 8;

  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_positions);
}
483
// Wide vertical-edge filter over 16 positions, all sharing one parameter set.
void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  const int num_positions = 16;

  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_positions);
}
488
Sebastien Alaiwan71e87842017-04-12 16:03:28 +0200489#if CONFIG_HIGHBITDEPTH
Ryan Lei392d0ff2017-02-09 13:05:42 -0800490#if CONFIG_PARALLEL_DEBLOCKING
// Should any filter be applied at all? High-bit-depth variant of the
// two-sample mask: limit and blimit are scaled up by (bd - 8) bits before
// being compared with the sample differences. Returns -1 (all bits set) to
// filter, 0 to skip.
static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                         uint16_t p1, uint16_t p0, uint16_t q0,
                                         uint16_t q1, int bd) {
  const int16_t limit16 = (int16_t)((uint16_t)limit << (bd - 8));
  const int16_t blimit16 = (int16_t)((uint16_t)blimit << (bd - 8));
  const int step_too_big = abs(p1 - p0) > limit16 || abs(q1 - q0) > limit16;
  const int edge_too_big = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16;
  return (step_too_big || edge_too_big) ? 0 : -1;
}
502}
503#endif // CONFIG_PARALLEL_DEBLOCKING
504
// Should any filter be applied at all? High-bit-depth variant of the
// four-sample mask: limit and blimit are scaled up by (bd - 8) bits before
// being compared with the sample differences. Returns -1 (all bits set) to
// filter, 0 to skip.
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2, uint16_t p1,
                                        uint16_t p0, uint16_t q0, uint16_t q1,
                                        uint16_t q2, uint16_t q3, int bd) {
  const int16_t limit16 = (int16_t)((uint16_t)limit << (bd - 8));
  const int16_t blimit16 = (int16_t)((uint16_t)blimit << (bd - 8));
  const int p_step_too_big = abs(p3 - p2) > limit16 ||
                             abs(p2 - p1) > limit16 || abs(p1 - p0) > limit16;
  const int q_step_too_big = abs(q1 - q0) > limit16 ||
                             abs(q2 - q1) > limit16 || abs(q3 - q2) > limit16;
  const int edge_too_big = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16;
  return (p_step_too_big || q_step_too_big || edge_too_big) ? 0 : -1;
}
522
// High-bit-depth flatness test over four samples per side: thresh is scaled
// up by (bd - 8) bits. Returns -1 (all bits set) when p1..p3 all lie within
// the scaled threshold of p0 and q1..q3 within it of q0; 0 otherwise.
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2, uint16_t q3,
                                       int bd) {
  const int16_t thresh16 = (int16_t)((uint16_t)thresh << (bd - 8));

  if (abs(p1 - p0) > thresh16 || abs(p2 - p0) > thresh16 ||
      abs(p3 - p0) > thresh16) {
    return 0;
  }
  if (abs(q1 - q0) > thresh16 || abs(q2 - q0) > thresh16 ||
      abs(q3 - q0) > thresh16) {
    return 0;
  }
  return -1;
}
537
538static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
539 uint16_t p2, uint16_t p1, uint16_t p0,
540 uint16_t q0, uint16_t q1, uint16_t q2,
541 uint16_t q3, uint16_t q4, int bd) {
542 int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
543 int16_t thresh16 = (uint16_t)thresh << (bd - 8);
544 mask |= (abs(p4 - p0) > thresh16) * -1;
545 mask |= (abs(q4 - q0) > thresh16) * -1;
546 return ~mask;
547}
548
// Is there high edge variance next to the edge? thresh is scaled up by
// (bd - 8) bits. Returns -1 (all 16 bits set) when |p1 - p0| or |q1 - q0|
// exceeds the scaled threshold, and 0 otherwise.
static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
                                      uint16_t q0, uint16_t q1, int bd) {
  const int16_t thresh16 = (int16_t)((uint16_t)thresh << (bd - 8));
  const int p_variance = abs(p1 - p0);
  const int q_variance = abs(q1 - q0);
  return (p_variance > thresh16 || q_variance > thresh16) ? -1 : 0;
}
559
560static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
561 uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
562 int bd) {
563 int16_t filter1, filter2;
564 // ^0x80 equivalent to subtracting 0x80 from the values to turn them
565 // into -128 to +127 instead of 0 to 255.
566 int shift = bd - 8;
567 const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
568 const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
569 const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
570 const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
571 const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
572
573 // Add outer taps if we have high edge variance.
574 int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
575
576 // Inner taps.
577 filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
578
579 // Save bottom 3 bits so that we round one side +4 and the other +3
580 // if it equals 4 we'll set to adjust by -1 to account for the fact
581 // we'd round 3 the other way.
582 filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
583 filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
584
585 *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
586 *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
587
588 // Outer tap adjustments.
589 filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
590
591 *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
592 *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
593}
594
// High-bit-depth narrow filter (highbd_filter4) across a horizontal edge for
// 8 consecutive positions starting at s. Samples on either side of the edge
// are p elements apart; bd is the bit depth.
void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
#if !CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask = highbd_filter_mask(
        *limit, *blimit, s[-4 * p], s[-3 * p], s[-2 * p], s[-p], s[0 * p],
        s[1 * p], s[2 * p], s[3 * p], bd);
#else   // CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask = highbd_filter_mask2(*limit, *blimit, s[-2 * p], s[-p],
                                            s[0 * p], s[1 * p], bd);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
  }
}
626
// Filters two adjacent 8-position horizontal edge segments: the first at s
// with parameter set 0, the second at s + 8 with parameter set 1.
void aom_highbd_lpf_horizontal_4_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second_segment = s + 8;

  aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_horizontal_4_c(second_segment, p, blimit1, limit1, thresh1,
                                bd);
}
634
// High-bit-depth narrow filter (highbd_filter4) across a vertical edge for 8
// positions starting at s; consecutive positions are pitch elements apart and
// the samples around the edge are adjacent in memory. bd is the bit depth.
void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int row;

  for (row = 0; row < 8; ++row, s += pitch) {
#if !CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask = highbd_filter_mask(*limit, *blimit, s[-4], s[-3],
                                           s[-2], s[-1], s[0], s[1], s[2],
                                           s[3], bd);
#else   // CONFIG_PARALLEL_DEBLOCKING
    const int8_t mask =
        highbd_filter_mask2(*limit, *blimit, s[-2], s[-1], s[0], s[1], bd);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
  }
}
658
// Filters two stacked 8-position vertical edge segments: the first at s with
// parameter set 0, the second 8 * pitch elements further on with parameter
// set 1.
void aom_highbd_lpf_vertical_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second_segment = s + 8 * pitch;

  aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_vertical_4_c(second_segment, pitch, blimit1, limit1, thresh1,
                              bd);
}
667
668static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
669 uint16_t *op3, uint16_t *op2, uint16_t *op1,
670 uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
671 uint16_t *oq2, uint16_t *oq3, int bd) {
672 if (flat && mask) {
673 const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
674 const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
675
676 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
677 *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
678 *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
679 *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
680 *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
681 *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
682 *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
683 } else {
684 highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
685 }
686}
687
// High-bit-depth highbd_filter8 across a horizontal edge for 8 consecutive
// positions starting at s. Samples on either side of the edge are p elements
// apart; the flatness test is called with a threshold of 1 and bd is the bit
// depth.
void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
    const uint16_t p0 = s[-p];
    const uint16_t q0 = s[0 * p];
    const uint16_t q1 = s[1 * p];
    const uint16_t q2 = s[2 * p];
    const uint16_t q3 = s[3 * p];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
  }
}
708
// Filters two adjacent 8-position horizontal edge segments with
// highbd_filter8: the first at s with parameter set 0, the second at s + 8
// with parameter set 1.
void aom_highbd_lpf_horizontal_8_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second_segment = s + 8;

  aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_horizontal_8_c(second_segment, p, blimit1, limit1, thresh1,
                                bd);
}
716
// Runs highbd_filter8() on 8 consecutive rows. |s| addresses q0 of the first
// row; within a row p3..p0 are s[-4]..s[-1] and q0..q3 are s[0]..s[3]. |pitch|
// is the element distance from one row to the next.
void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int rows_left;

  for (rows_left = 8; rows_left > 0; --rows_left, s += pitch) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
                   s + 2, s + 3, bd);
  }
}
734
// Filters two vertically stacked 8-row groups: rows 0..7 with the first set
// of limits/threshold and rows 8..15 (starting 8 * pitch further on) with the
// second set.
void aom_highbd_lpf_vertical_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const lower_half = s + 8 * pitch;

  aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_vertical_8_c(lower_half, pitch, blimit1, limit1, thresh1, bd);
}
743
744static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
745 uint8_t flat2, uint16_t *op7, uint16_t *op6,
746 uint16_t *op5, uint16_t *op4, uint16_t *op3,
747 uint16_t *op2, uint16_t *op1, uint16_t *op0,
748 uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
749 uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
750 uint16_t *oq6, uint16_t *oq7, int bd) {
751 if (flat2 && flat && mask) {
752 const uint16_t p7 = *op7;
753 const uint16_t p6 = *op6;
754 const uint16_t p5 = *op5;
755 const uint16_t p4 = *op4;
756 const uint16_t p3 = *op3;
757 const uint16_t p2 = *op2;
758 const uint16_t p1 = *op1;
759 const uint16_t p0 = *op0;
760 const uint16_t q0 = *oq0;
761 const uint16_t q1 = *oq1;
762 const uint16_t q2 = *oq2;
763 const uint16_t q3 = *oq3;
764 const uint16_t q4 = *oq4;
765 const uint16_t q5 = *oq5;
766 const uint16_t q6 = *oq6;
767 const uint16_t q7 = *oq7;
768
769 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
770 *op6 = ROUND_POWER_OF_TWO(
771 p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
772 *op5 = ROUND_POWER_OF_TWO(
773 p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
774 *op4 = ROUND_POWER_OF_TWO(
775 p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
776 *op3 = ROUND_POWER_OF_TWO(
777 p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
778 *op2 = ROUND_POWER_OF_TWO(
779 p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
780 4);
781 *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
782 q0 + q1 + q2 + q3 + q4 + q5,
783 4);
784 *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
785 q1 + q2 + q3 + q4 + q5 + q6,
786 4);
787 *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
788 q2 + q3 + q4 + q5 + q6 + q7,
789 4);
790 *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
791 q3 + q4 + q5 + q6 + q7 * 2,
792 4);
793 *oq2 = ROUND_POWER_OF_TWO(
794 p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
795 4);
796 *oq3 = ROUND_POWER_OF_TWO(
797 p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
798 *oq4 = ROUND_POWER_OF_TWO(
799 p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
800 *oq5 = ROUND_POWER_OF_TWO(
801 p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
802 *oq6 = ROUND_POWER_OF_TWO(
803 p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
804 } else {
805 highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
806 bd);
807 }
808}
809
// Runs highbd_filter16() on 8 * |count| consecutive columns. |s| addresses q0
// of the first column and |p| is the element distance between successive
// samples of one column, so each column spans s[-8 * p]..s[7 * p]. The inner
// eight samples drive the mask/flat decisions; the outer eight (together with
// p0 and q0) drive flat2.
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int count,
                                            int bd) {
  const int width = 8 * count;
  int col;

  for (col = 0; col < width; ++col, ++s) {
    const uint16_t p7 = s[-8 * p], p6 = s[-7 * p];
    const uint16_t p5 = s[-6 * p], p4 = s[-5 * p];
    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p], p0 = s[-p];
    const uint16_t q0 = s[0 * p], q1 = s[1 * p];
    const uint16_t q2 = s[2 * p], q3 = s[3 * p];
    const uint16_t q4 = s[4 * p], q5 = s[5 * p];
    const uint16_t q6 = s[6 * p], q7 = s[7 * p];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 =
        highbd_flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, bd);

    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
                    s + 6 * p, s + 7 * p, bd);
  }
}
843
// 16-sample filter across a horizontal edge, applied to one group of
// 8 columns.
void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
  const int groups_of_8 = 1;

  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups_of_8, bd);
}
850
// 16-sample filter across a horizontal edge, applied to two groups of
// 8 columns (16 columns in total) sharing one set of limits/threshold.
void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh, int bd) {
  const int groups_of_8 = 2;

  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups_of_8, bd);
}
857
// Runs highbd_filter16() on |count| consecutive rows. |s| addresses q0 of the
// first row, each row spans s[-8]..s[7], and |p| is the element distance from
// one row to the next. The inner eight samples drive the mask/flat decisions;
// the outer eight (together with p0 and q0) drive flat2.
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int count,
                                          int bd) {
  int row;

  for (row = 0; row < count; ++row, s += p) {
    const uint16_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5];
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const uint16_t q4 = s[4], q5 = s[5], q6 = s[6], q7 = s[7];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 =
        highbd_flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, bd);

    highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                    s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                    s + 5, s + 6, s + 7, bd);
  }
}
887
// 16-sample filter across a vertical edge, applied to 8 rows.
void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  const int rows = 8;

  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, rows, bd);
}
893
// 16-sample filter across a vertical edge, applied to 16 rows that share one
// set of limits/threshold.
void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
  const int rows = 16;

  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, rows, bd);
}
Sebastien Alaiwan71e87842017-04-12 16:03:28 +0200900#endif // CONFIG_HIGHBITDEPTH