/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./av1_rtcd.h"
#include "aom_ports/bitops.h"
#include "aom_ports/mem.h"

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
                             unsigned int adjdamp) {
  v128 diff = v128_sub_16(a, b);
  const v128 sign = v128_shr_n_s16(diff, 15);
  diff = v128_abs_s16(diff);
  const v128 s =
      v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
  return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
}

// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                           unsigned int adjdamp) {
  const v256 diff16 = v256_sub_16(a, b);
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
  diff = v128_abs_s8(diff);
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
}
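
// For reference only: a scalar sketch of the per-sample operation that
// constrain16() and constrain() above vectorise, assuming ordinary two's
// complement arithmetic and in-range inputs. The name is illustrative and
// not used elsewhere in the codebase.
SIMD_INLINE int constrain_scalar_sketch(int a, int b, unsigned int strength,
                                        unsigned int adjdamp) {
  const int diff = a - b;
  const int absdiff = diff < 0 ? -diff : diff;
  int clip = (int)strength - (absdiff >> adjdamp);
  if (clip < 0) clip = 0;  // max(0, strength - (abs(a - b) >> adjdamp))
  const int mag = absdiff < clip ? absdiff : clip;
  return diff < 0 ? -mag : mag;  // reapply the sign of a - b
}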

// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
                            v256 f, v256 g, v256 h, unsigned int s,
                            unsigned int dmp) {
  const v128 bdeg =
      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
  const v128 delta = v128_add_8(
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(8),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          4));
}
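
// For reference only: a scalar sketch of the final rounding step in
// calc_delta(), assuming an arithmetic right shift of negative values. The
// weighted sum of constrain() terms (16 times the filter delta) is divided
// by 16 and rounded to nearest, with ties rounded away from zero; the
// v128_cmplt_s8() term above contributes the -1 for negative lanes. The
// name is illustrative and not used elsewhere in the codebase.
SIMD_INLINE int round_delta_sketch(int x, int delta16) {
  return x + ((8 + delta16 - (delta16 < 0)) >> 4);
}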

// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
//         3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d)
SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
                             unsigned int s, unsigned int dmp) {
  const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
  const v128 delta =
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(v128_add_8(bc, bc), bc));
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(4),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          3));
}

// Process blocks of width 8, two lines at a time, 8 bit.
static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v128 l1 = v128_load_aligned(src);
    const v128 l2 = v128_load_aligned(src + sstride);
    const v128 l3 = v128_load_aligned(src - sstride);
    const v128 l4 = v128_load_aligned(src + 2 * sstride);
    const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
    const v256 b = v256_from_v128(l3, l1);
    const v256 g = v256_from_v128(l2, l4);
    const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
                              strength, adjdamp);

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 4, four lines at a time, 8 bit.
static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    const v64 l0 = v64_load_aligned(src - 2 * sstride);
    const v64 l1 = v64_load_aligned(src - sstride);
    const v64 l2 = v64_load_aligned(src);
    const v64 l3 = v64_load_aligned(src + sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v64 l5 = v64_load_aligned(src + 3 * sstride);
    const v64 l6 = v64_load_aligned(src + 4 * sstride);
    const v64 l7 = v64_load_aligned(src + 5 * sstride);
    const v128 o =
        calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
                   v256_from_v64(l1, l2, l3, l4),
                   v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2)),
                   v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1)),
                   v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1)),
                   v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2)),
                   v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
                   strength, adjdamp);

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

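// Process blocks of width 8, horizontal filter, two lines at a time, 8 bit.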
static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v256 x = v256_from_v128(v128_load_aligned(src),
                                  v128_load_aligned(src + sstride));
    const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 4, horizontal filter, four lines at a time, 8 bit.
static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    const v256 a = v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2));
    const v256 b = v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1));
    const v256 c = v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1));
    const v256 d = v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2));

    const v128 o = calc_hdelta(
        v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride),
                      v64_load_aligned(src + 2 * sstride),
                      v64_load_aligned(src + 3 * sstride)),
        a, b, c, d, strength, adjdamp);

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

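// Dispatchers. aom_clpf_block() applies the filter that uses both vertical
// and horizontal neighbours, aom_clpf_hblock() the horizontal-only filter.
// Each selects the 4- or 8-pixel-wide SIMD kernel, passes an adjusted
// damping of dmp - get_msb(strength) (the damping reduced by
// floor(log2(strength))), and falls back to the C implementation for block
// sizes the SIMD kernels do not handle.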
void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride,
                               int sstride, int sizex, int sizey,
                               unsigned int strength, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
    aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}

void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
                                int sstride, int sizex, int sizey,
                                unsigned int strength, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
    aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}

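// High bitdepth (16-bit sample) versions of the functions above.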
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                v128 f, v128 g, v128 h, unsigned int s,
                                unsigned int dmp) {
  const v128 bdeg = v128_add_16(
      v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
  const v128 delta = v128_add_16(
      v128_add_16(
          v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
          v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(8),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          4));
}

static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp, int dstride) {
  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}

static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int adjdamp) {
  v128_store_aligned(dst,
                     calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp));
}

// delta = 1/8 * constrain(a, x, s, dmp) + 3/8 * constrain(b, x, s, dmp) +
//         3/8 * constrain(c, x, s, dmp) + 1/8 * constrain(d, x, s, dmp)
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
                                 unsigned int s, unsigned int dmp) {
  const v128 bc =
      v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
  const v128 delta = v128_add_16(
      v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(v128_add_16(bc, bc), bc));
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(4),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          3));
}

static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s,
                             unsigned int adjdamp, int dstride) {
  o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp);
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}

static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s,
                             unsigned int adjdamp) {
  v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp));
}

// Process blocks of width 4, two lines at a time.
static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
    const v64 l3 = v64_load_aligned(src - sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
                    strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// The simplest case: width 8, one line at a time. Start here if you need to
// understand the functions.
static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                      int dstride, int sstride, int sizey,
                                      unsigned int strength,
                                      unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_aligned(src - 2 * sstride);
    const v128 b = v128_load_aligned(src - 1 * sstride);
    const v128 g = v128_load_aligned(src + sstride);
    const v128 h = v128_load_aligned(src + 2 * sstride);
    const v128 c = v128_load_unaligned(src - 2);
    const v128 d = v128_load_unaligned(src - 1);
    const v128 e = v128_load_unaligned(src + 1);
    const v128 f = v128_load_unaligned(src + 2);

    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
    src += sstride;
    dst += dstride;
  }
}

// Process blocks of width 4, horizontal filter, two lines at a time.
static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
                                        int dstride, int sstride, int sizey,
                                        unsigned int strength,
                                        unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
                                   v64_load_unaligned(src + sstride)),
                     a, b, c, d, dst, strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 8, horizontal filter, one line at a time.
static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_unaligned(src - 2);
    const v128 b = v128_load_unaligned(src - 1);
    const v128 c = v128_load_unaligned(src + 1);
    const v128 d = v128_load_unaligned(src + 2);

    calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp);
    src += sstride;
    dst += dstride;
  }
}

void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizex,
                                   int sizey, unsigned int strength,
                                   unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block width not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
    aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
                         dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}

void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizex,
                                    int sizey, unsigned int strength,
                                    unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block width not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
    aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
                          dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}