blob: 636d2a12eac4d375bfb7bf32c0c8d0613f20e428 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12#include "./aom_dsp_rtcd.h"
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020013#include "aom_ports/mem.h"
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010014#include "aom_ports/bitops.h"
15#include "av1/common/clpf_simd_kernel.h"
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +020016
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020017// Process blocks of width 8, two lines at a time, 8 bit.
18static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
Steinar Midtskogen73ad5232017-01-30 14:39:07 +010019 int dstride, int x0, int y0, int sizey,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010020 BOUNDARY_TYPE bt, unsigned int strength) {
Steinar Midtskogen73ad5232017-01-30 14:39:07 +010021 const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
22 const int right = !(bt & TILE_RIGHT_BOUNDARY);
23 const int left = !(bt & TILE_LEFT_BOUNDARY);
24 const int top = bt & TILE_ABOVE_BOUNDARY ? y0 : -1;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020025 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010026 c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020027 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010028 d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020029 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010030 e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020031 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010032 f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020033 int y;
34
Steinar Midtskogene8224c72016-08-24 13:00:04 +020035 dst += x0 + y0 * dstride;
36 src += x0 + y0 * sstride;
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020037
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020038 for (y = 0; y < sizey; y += 2) {
39 const v64 l1 = v64_load_aligned(src);
40 const v64 l2 = v64_load_aligned(src + sstride);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010041 const v64 l3 = v64_load_aligned(src - (y != top) * sstride);
42 const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020043 v128 o = v128_from_v64(l1, l2);
44 const v128 a =
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010045 v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3);
46 const v128 b = v128_from_v64(l3, l1);
47 const v128 g = v128_from_v64(l2, l4);
48 const v128 h = v128_from_v64(
49 l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride));
50 v128 c, d, e, f;
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020051
Steinar Midtskogen73ad5232017-01-30 14:39:07 +010052 if (left) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010053 c = v128_from_v64(v64_load_unaligned(src - 2),
Steinar Midtskogene66fc872016-09-26 12:51:25 +020054 v64_load_unaligned(src - 2 + sstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010055 d = v128_from_v64(v64_load_unaligned(src - 1),
Steinar Midtskogene66fc872016-09-26 12:51:25 +020056 v64_load_unaligned(src - 1 + sstride));
57 } else { // Left clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +020058 c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010059 d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +020060 }
Steinar Midtskogene66fc872016-09-26 12:51:25 +020061 if (right) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010062 e = v128_from_v64(v64_load_unaligned(src + 1),
Steinar Midtskogene66fc872016-09-26 12:51:25 +020063 v64_load_unaligned(src + 1 + sstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010064 f = v128_from_v64(v64_load_unaligned(src + 2),
Steinar Midtskogene66fc872016-09-26 12:51:25 +020065 v64_load_unaligned(src + 2 + sstride));
66 } else { // Right clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +020067 e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010068 f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020069 }
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020070
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010071 o = calc_delta(o, a, b, c, d, e, f, g, h, strength);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020072 v64_store_aligned(dst, v128_high_v64(o));
73 v64_store_aligned(dst + dstride, v128_low_v64(o));
74 src += sstride * 2;
75 dst += dstride * 2;
76 }
77}
78
Steinar Midtskogenf844e6e2017-02-09 17:24:37 +010079// As above, but with no clipping tests
80static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
81 int dstride, int x0, int y0, int sizey,
82 unsigned int strength) {
83 int y;
84
85 dst += x0 + y0 * dstride;
86 src += x0 + y0 * sstride;
87
88 for (y = 0; y < sizey; y += 2) {
89 const v64 l1 = v64_load_aligned(src);
90 const v64 l2 = v64_load_aligned(src + sstride);
91 const v64 l3 = v64_load_aligned(src - sstride);
92 const v64 l4 = v64_load_aligned(src + 2 * sstride);
93 const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
94 const v128 b = v128_from_v64(l3, l1);
95 const v128 g = v128_from_v64(l2, l4);
96 const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
97 const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
98 v64_load_unaligned(src - 2 + sstride));
99 const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
100 v64_load_unaligned(src - 1 + sstride));
101 const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
102 v64_load_unaligned(src + 1 + sstride));
103 const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
104 v64_load_unaligned(src + 2 + sstride));
105 const v128 o =
106 calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, strength);
107
108 v64_store_aligned(dst, v128_high_v64(o));
109 v64_store_aligned(dst + dstride, v128_low_v64(o));
110 src += sstride * 2;
111 dst += dstride * 2;
112 }
113}
114
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200115// Process blocks of width 4, four lines at a time, 8 bit.
116static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100117 int dstride, int x0, int y0, int sizey,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100118 BOUNDARY_TYPE bt, unsigned int strength) {
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100119 const int right = !(bt & TILE_RIGHT_BOUNDARY);
120 const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 4 : -1;
121 const int left = !(bt & TILE_LEFT_BOUNDARY);
122 const int top = bt & TILE_ABOVE_BOUNDARY ? y0 : -1;
123
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200124 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100125 c_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200126 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100127 d_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200128 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100129 e_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200130 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100131 f_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200132 int y;
133
134 dst += x0 + y0 * dstride;
135 src += x0 + y0 * sstride;
136
137 for (y = 0; y < sizey; y += 4) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100138 const uint32_t l0 = u32_load_aligned(src - 2 * (y != top) * sstride);
139 const uint32_t l1 = u32_load_aligned(src - (y != top) * sstride);
140 const uint32_t l2 = u32_load_aligned(src);
141 const uint32_t l3 = u32_load_aligned(src + sstride);
142 const uint32_t l4 = u32_load_aligned(src + 2 * sstride);
143 const uint32_t l5 = u32_load_aligned(src + 3 * sstride);
144 const uint32_t l6 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
145 const uint32_t l7 =
146 u32_load_aligned(src + (2 * (y != bottom) + 3) * sstride);
147 v128 o = v128_from_32(l2, l3, l4, l5);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200148 const v128 a = v128_from_32(l0, l1, l2, l3);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100149 const v128 b = v128_from_32(l1, l2, l3, l4);
150 const v128 g = v128_from_32(l3, l4, l5, l6);
151 const v128 h = v128_from_32(l4, l5, l6, l7);
152 v128 c, d, e, f;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200153
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100154 if (left) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100155 c = v128_from_32(u32_load_unaligned(src - 2),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200156 u32_load_unaligned(src + sstride - 2),
157 u32_load_unaligned(src + 2 * sstride - 2),
158 u32_load_unaligned(src + 3 * sstride - 2));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100159 d = v128_from_32(u32_load_unaligned(src - 1),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200160 u32_load_unaligned(src + sstride - 1),
161 u32_load_unaligned(src + 2 * sstride - 1),
162 u32_load_unaligned(src + 3 * sstride - 1));
163 } else { // Left clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200164 c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100165 d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200166 }
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200167 if (right) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100168 e = v128_from_32(u32_load_unaligned(src + 1),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200169 u32_load_unaligned(src + sstride + 1),
170 u32_load_unaligned(src + 2 * sstride + 1),
171 u32_load_unaligned(src + 3 * sstride + 1));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100172 f = v128_from_32(u32_load_unaligned(src + 2),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200173 u32_load_unaligned(src + sstride + 2),
174 u32_load_unaligned(src + 2 * sstride + 2),
175 u32_load_unaligned(src + 3 * sstride + 2));
176 } else { // Right clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200177 e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100178 f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200179 }
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200180
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100181 o = calc_delta(o, a, b, c, d, e, f, g, h, strength);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200182 u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
183 u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
184 u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
185 u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
186
187 dst += 4 * dstride;
188 src += 4 * sstride;
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200189 }
190}
191
Steinar Midtskogenf844e6e2017-02-09 17:24:37 +0100192// As above, but with no clipping tests
193static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
194 int dstride, int x0, int y0, int sizey,
195 unsigned int strength) {
196 int y;
197
198 dst += x0 + y0 * dstride;
199 src += x0 + y0 * sstride;
200
201 for (y = 0; y < sizey; y += 4) {
202 const uint32_t l0 = u32_load_aligned(src - 2 * sstride);
203 const uint32_t l1 = u32_load_aligned(src - sstride);
204 const uint32_t l2 = u32_load_aligned(src);
205 const uint32_t l3 = u32_load_aligned(src + sstride);
206 const uint32_t l4 = u32_load_aligned(src + 2 * sstride);
207 const uint32_t l5 = u32_load_aligned(src + 3 * sstride);
208 const uint32_t l6 = u32_load_aligned(src + 4 * sstride);
209 const uint32_t l7 = u32_load_aligned(src + 5 * sstride);
210 const v128 a = v128_from_32(l0, l1, l2, l3);
211 const v128 b = v128_from_32(l1, l2, l3, l4);
212 const v128 g = v128_from_32(l3, l4, l5, l6);
213 const v128 h = v128_from_32(l4, l5, l6, l7);
214 const v128 c = v128_from_32(u32_load_unaligned(src - 2),
215 u32_load_unaligned(src + sstride - 2),
216 u32_load_unaligned(src + 2 * sstride - 2),
217 u32_load_unaligned(src + 3 * sstride - 2));
218 const v128 d = v128_from_32(u32_load_unaligned(src - 1),
219 u32_load_unaligned(src + sstride - 1),
220 u32_load_unaligned(src + 2 * sstride - 1),
221 u32_load_unaligned(src + 3 * sstride - 1));
222 const v128 e = v128_from_32(u32_load_unaligned(src + 1),
223 u32_load_unaligned(src + sstride + 1),
224 u32_load_unaligned(src + 2 * sstride + 1),
225 u32_load_unaligned(src + 3 * sstride + 1));
226 const v128 f = v128_from_32(u32_load_unaligned(src + 2),
227 u32_load_unaligned(src + sstride + 2),
228 u32_load_unaligned(src + 2 * sstride + 2),
229 u32_load_unaligned(src + 3 * sstride + 2));
230
231 const v128 o = calc_delta(v128_from_32(l2, l3, l4, l5), a, b, c, d, e, f, g,
232 h, strength);
233
234 u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
235 u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
236 u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
237 u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
238
239 dst += 4 * dstride;
240 src += 4 * sstride;
241 }
242}
243
Steinar Midtskogene8224c72016-08-24 13:00:04 +0200244void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
245 int dstride, int x0, int y0, int sizex,
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100246 int sizey, unsigned int strength,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100247 BOUNDARY_TYPE bt, unsigned int bd) {
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200248 if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
249 // Fallback to C for odd sizes:
250 // * block widths not 4 or 8
251 // * block heights not a multiple of 4 if the block width is 4
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100252 aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100253 bt, bd);
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200254 } else {
Steinar Midtskogenf844e6e2017-02-09 17:24:37 +0100255 if (bt)
256 (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0,
257 y0, sizey, bt, strength);
258 else
259 (sizex == 4 ? clpf_block4_noclip : clpf_block8_noclip)(
260 src, dst, sstride, dstride, x0, y0, sizey, strength);
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200261 }
262}
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200263
264#if CONFIG_AOM_HIGHBITDEPTH
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100265// sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
266// strength + (abs(a - b) >> (bd - 3 - log2(s)))))
267SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
268 unsigned int bd) {
269 const v128 diff = v128_sub_16(v128_max_s16(a, b), v128_min_s16(a, b));
270 const v128 sign = v128_cmpeq_16(v128_min_s16(a, b), a); // -(a <= b)
271 const v128 zero = v128_zero();
272 const v128 s = v128_max_s16(
273 zero, v128_sub_16(v128_dup_16(strength),
274 v128_shr_u16(diff, bd - 3 - get_msb(strength))));
275 return v128_sub_16(
276 v128_xor(sign,
277 v128_max_s16(
278 zero, v128_sub_16(
279 diff, v128_max_s16(zero, v128_sub_16(diff, s))))),
280 sign);
281}
282
283// delta = 1/16 * constrain(a, x, s, bd) + 3/16 * constrain(b, x, s, bd) +
284// 1/16 * constrain(c, x, s, bd) + 3/16 * constrain(d, x, s, bd) +
285// 3/16 * constrain(e, x, s, bd) + 1/16 * constrain(f, x, s, bd) +
286// 3/16 * constrain(g, x, s, bd) + 1/16 * constrain(h, x, s, bd)
287SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
288 v128 f, v128 g, v128 h, unsigned int s,
289 unsigned int bd) {
290 const v128 bdeg = v128_add_16(
291 v128_add_16(constrain_hbd(b, x, s, bd), constrain_hbd(d, x, s, bd)),
292 v128_add_16(constrain_hbd(e, x, s, bd), constrain_hbd(g, x, s, bd)));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200293 const v128 delta = v128_add_16(
294 v128_add_16(
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100295 v128_add_16(constrain_hbd(a, x, s, bd), constrain_hbd(c, x, s, bd)),
296 v128_add_16(constrain_hbd(f, x, s, bd), constrain_hbd(h, x, s, bd))),
297 v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200298 return v128_add_16(
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100299 x,
300 v128_shr_s16(
301 v128_add_16(v128_dup_16(8),
302 v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
303 4));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200304}
305
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200306static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100307 v128 f, v128 g, v128 h, uint16_t *dst,
308 unsigned int s, unsigned int bd, int dstride) {
309 o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200310 v64_store_aligned(dst, v128_high_v64(o));
311 v64_store_aligned(dst + dstride, v128_low_v64(o));
312}
313
314static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100315 v128 f, v128 g, v128 h, uint16_t *dst,
316 unsigned int s, unsigned int bd) {
317 v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200318}
319
320// Process blocks of width 4, two lines at time.
321SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
322 int sstride, int dstride, int x0, int y0,
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100323 int sizey, unsigned int strength,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100324 BOUNDARY_TYPE bt, unsigned int bd) {
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100325 const int right = !(bt & TILE_RIGHT_BOUNDARY);
326 const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
327 const int left = !(bt & TILE_LEFT_BOUNDARY);
328 const int top = bt & TILE_ABOVE_BOUNDARY ? y0 : -1;
329
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200330 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100331 c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200332 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100333 d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200334 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100335 e_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200336 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100337 f_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200338 int y;
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200339
340 dst += x0 + y0 * dstride;
341 src += x0 + y0 * sstride;
342
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200343 for (y = 0; y < sizey; y += 2) {
344 const v64 l1 = v64_load_aligned(src);
345 const v64 l2 = v64_load_aligned(src + sstride);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100346 const v64 l3 = v64_load_aligned(src - (y != top) * sstride);
347 const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200348 v128 o = v128_from_v64(l1, l2);
349 const v128 a =
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100350 v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3);
351 const v128 b = v128_from_v64(l3, l1);
352 const v128 g = v128_from_v64(l2, l4);
353 const v128 h = v128_from_v64(
354 l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride));
355 v128 c, d, e, f;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200356
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100357 if (left) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100358 c = v128_from_v64(v64_load_unaligned(src - 2),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200359 v64_load_unaligned(src - 2 + sstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100360 d = v128_from_v64(v64_load_unaligned(src - 1),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200361 v64_load_unaligned(src - 1 + sstride));
362 } else { // Left clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200363 c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100364 d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200365 }
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200366 if (right) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100367 e = v128_from_v64(v64_load_unaligned(src + 1),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200368 v64_load_unaligned(src + 1 + sstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100369 f = v128_from_v64(v64_load_unaligned(src + 2),
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200370 v64_load_unaligned(src + 2 + sstride));
371 } else { // Right clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200372 e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100373 f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200374 }
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100375 calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, bd, dstride);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200376 src += sstride * 2;
377 dst += dstride * 2;
378 }
379}
380
Steinar Midtskogenf844e6e2017-02-09 17:24:37 +0100381// As above, but with no clipping tests
382SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
383 int sstride, int dstride, int x0,
384 int y0, int sizey,
385 unsigned int strength,
386 unsigned int bd) {
387 int y;
388
389 dst += x0 + y0 * dstride;
390 src += x0 + y0 * sstride;
391
392 for (y = 0; y < sizey; y += 2) {
393 const v64 l1 = v64_load_aligned(src);
394 const v64 l2 = v64_load_aligned(src + sstride);
395 const v64 l3 = v64_load_aligned(src - sstride);
396 const v64 l4 = v64_load_aligned(src + 2 * sstride);
397 const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
398 const v128 b = v128_from_v64(l3, l1);
399 const v128 g = v128_from_v64(l2, l4);
400 const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
401 const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
402 v64_load_unaligned(src - 2 + sstride));
403 const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
404 v64_load_unaligned(src - 1 + sstride));
405 const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
406 v64_load_unaligned(src + 1 + sstride));
407 const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
408 v64_load_unaligned(src + 2 + sstride));
409
410 calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
411 strength, bd, dstride);
412 src += sstride * 2;
413 dst += dstride * 2;
414 }
415}
416
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200417// The most simple case. Start here if you need to understand the functions.
418SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
419 int dstride, int x0, int y0, int sizey,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100420 unsigned int strength, BOUNDARY_TYPE bt,
421 unsigned int bd) {
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100422 const int right = !(bt & TILE_RIGHT_BOUNDARY);
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100423 const int left = !(bt & TILE_LEFT_BOUNDARY);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100424 const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2;
425 const int ymax = sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100426
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200427 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100428 c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200429 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100430 d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200431 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100432 e_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200433 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100434 f_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200435 int y;
436
437 dst += x0 + y0 * dstride;
438 src += x0 + y0 * sstride;
439
440 // Read 8 set of pixels at a time. Clipping along upper and lower
441 // edges is handled by reading the upper or lower line twice.
442 // Clipping along the left and right edges is handled by shuffle
443 // instructions doing shift and pad.
444 for (y = 0; y < sizey; y++) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100445 const v128 o = v128_load_aligned(src + y * sstride);
446 const v128 a = v128_load_aligned(src + AOMMAX(ymin, y - 2) * sstride);
447 const v128 b = v128_load_aligned(src + AOMMAX(ymin, y - 1) * sstride);
448 const v128 g = v128_load_aligned(src + AOMMIN(ymax, y + 1) * sstride);
449 const v128 h = v128_load_aligned(src + AOMMIN(ymax, y + 2) * sstride);
450 v128 c, d, e, f;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200451
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100452 if (left) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100453 c = v128_load_unaligned(src + y * sstride - 2);
454 d = v128_load_unaligned(src + y * sstride - 1);
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200455 } else { // Left clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200456 c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100457 d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200458 }
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200459 if (right) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100460 e = v128_load_unaligned(src + y * sstride + 1);
461 f = v128_load_unaligned(src + y * sstride + 2);
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200462 } else { // Right clipping
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200463 e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100464 f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200465 }
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100466 calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200467 dst += dstride;
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200468 }
469}
470
Steinar Midtskogenf844e6e2017-02-09 17:24:37 +0100471// As above, but with no clipping tests
472SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
473 int sstride, int dstride, int x0, int y0,
474 int sizey, unsigned int strength,
475 unsigned int bd) {
476 int y;
477
478 dst += x0 + y0 * dstride;
479 src += x0 + y0 * sstride;
480
481 for (y = 0; y < sizey; y++) {
482 const v128 o = v128_load_aligned(src);
483 const v128 a = v128_load_aligned(src - 2 * sstride);
484 const v128 b = v128_load_aligned(src - 1 * sstride);
485 const v128 g = v128_load_aligned(src + sstride);
486 const v128 h = v128_load_aligned(src + 2 * sstride);
487 const v128 c = v128_load_unaligned(src - 2);
488 const v128 d = v128_load_unaligned(src - 1);
489 const v128 e = v128_load_unaligned(src + 1);
490 const v128 f = v128_load_unaligned(src + 2);
491
492 calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd);
493 src += sstride;
494 dst += dstride;
495 }
496}
497
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200498void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
499 int sstride, int dstride, int x0, int y0,
Steinar Midtskogen73ad5232017-01-30 14:39:07 +0100500 int sizex, int sizey, unsigned int strength,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100501 BOUNDARY_TYPE bt, unsigned int bd) {
Steinar Midtskogene66fc872016-09-26 12:51:25 +0200502 if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
503 // Fallback to C for odd sizes:
504 // * block width not 4 or 8
505 // * block heights not a multiple of 2 if the block width is 4
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200506 aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100507 strength, bt, bd);
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200508 } else {
Steinar Midtskogenf844e6e2017-02-09 17:24:37 +0100509 if (bt)
510 (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
511 src, dst, sstride, dstride, x0, y0, sizey, strength, bt, bd);
512 else
513 (sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)(
514 src, dst, sstride, dstride, x0, y0, sizey, strength, bd);
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200515 }
516}
517#endif