blob: 177359fc59268b283a7d14b0e04000e30eaffa6b [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020012#include "./aom_dsp_rtcd.h"
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020013#include "aom_dsp/aom_simd.h"
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020014#include "aom_ports/mem.h"
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010015#include "aom_ports/bitops.h"
16#include "av1/common/clpf_simd_kernel.h"
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020017
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010018SIMD_INLINE void clip_sides(v128 *c, v128 *d, v128 *e, v128 *f, int left,
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020019 int right) {
20 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010021 c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020022 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010023 d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020024 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010025 e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020026 DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010027 f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020028
29 if (!left) { // Left clipping
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020030 *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010031 *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020032 }
33 if (!right) { // Right clipping
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020034 *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010035 *f = v128_shuffle_8(*f, v128_load_aligned(f_shuff));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020036 }
37}
38
39SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
40 int rstride, int ostride, int x0, int y0,
41 int bottom, int right, int y, v128 *o, v128 *r,
42 v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010043 v128 *f, v128 *g, v128 *h) {
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020044 const v64 k1 = v64_load_aligned(org);
45 const v64 k2 = v64_load_aligned(org + ostride);
46 const v64 l1 = v64_load_aligned(rec);
47 const v64 l2 = v64_load_aligned(rec + rstride);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010048 const v64 l3 = v64_load_aligned(rec - (y != -y0) * rstride);
49 const v64 l4 = v64_load_aligned(rec + ((y != bottom) + 1) * rstride);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020050 *o = v128_from_v64(k1, k2);
51 *r = v128_from_v64(l1, l2);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010052 *a = v128_from_v64(v64_load_aligned(rec - 2 * (y != -y0) * rstride), l3);
53 *b = v128_from_v64(l3, l1);
54 *g = v128_from_v64(l2, l4);
55 *h = v128_from_v64(l4,
56 v64_load_aligned(rec + (2 * (y != bottom) + 1) * rstride));
57 *c = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020058 v64_load_unaligned(rec - 2 * !!x0 + rstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010059 *d = v128_from_v64(v64_load_unaligned(rec - !!x0),
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020060 v64_load_unaligned(rec - !!x0 + rstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010061 *e = v128_from_v64(v64_load_unaligned(rec + !!right),
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020062 v64_load_unaligned(rec + !!right + rstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010063 *f = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020064 v64_load_unaligned(rec + 2 * !!right + rstride));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010065 clip_sides(c, d, e, f, x0, right);
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +020066}
67
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020068void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
69 int rstride, int ostride, int x0, int y0,
70 int width, int height, int *sum0, int *sum1,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010071 unsigned int strength, int size,
72 unsigned int bd) {
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020073 const int bottom = height - 2 - y0;
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010074 const int right = width - 8 - x0;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020075 ssd128_internal ssd0 = v128_ssd_u8_init();
76 ssd128_internal ssd1 = v128_ssd_u8_init();
77 int y;
78
79 if (size != 8) { // Fallback to plain C
80 aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010081 sum1, strength, size, bd);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020082 return;
83 }
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020084
85 rec += x0 + y0 * rstride;
86 org += x0 + y0 * ostride;
87
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020088 for (y = 0; y < 8; y += 2) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010089 v128 a, b, c, d, e, f, g, h, o, r;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020090 read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010091 &a, &b, &c, &d, &e, &f, &g, &h);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020092 ssd0 = v128_ssd_u8(ssd0, o, r);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +010093 ssd1 =
94 v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +020095 rec += rstride * 2;
96 org += ostride * 2;
Steinar Midtskogenbe668e92016-08-05 12:12:38 +020097 }
98 *sum0 += v128_ssd_u8_sum(ssd0);
99 *sum1 += v128_ssd_u8_sum(ssd1);
100}
101
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200102SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100103 v128 d, v128 e, v128 f, v128 g, v128 h,
104 ssd128_internal *ssd1, ssd128_internal *ssd2,
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200105 ssd128_internal *ssd3) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100106 *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1));
107 *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2));
108 *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4));
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200109}
110
111// Test multiple filter strengths at once.
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200112void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
113 int rstride, int ostride, int x0, int y0,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100114 int width, int height, int *sum, int size,
115 unsigned int bd) {
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200116 const int bottom = height - 2 - y0;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200117 const int right = width - 8 - x0;
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200118 ssd128_internal ssd0 = v128_ssd_u8_init();
119 ssd128_internal ssd1 = v128_ssd_u8_init();
120 ssd128_internal ssd2 = v128_ssd_u8_init();
121 ssd128_internal ssd3 = v128_ssd_u8_init();
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200122 int y;
123
124 if (size != 8) { // Fallback to plain C
125 aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100126 sum, size, bd);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200127 return;
128 }
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200129
130 rec += x0 + y0 * rstride;
131 org += x0 + y0 * ostride;
132
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200133 for (y = 0; y < 8; y += 2) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100134 v128 a, b, c, d, e, f, g, h, o, r;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200135 read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100136 &a, &b, &c, &d, &e, &f, &g, &h);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200137 ssd0 = v128_ssd_u8(ssd0, o, r);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100138 calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200139 rec += 2 * rstride;
140 org += 2 * ostride;
Steinar Midtskogenbe668e92016-08-05 12:12:38 +0200141 }
142 sum[0] += v128_ssd_u8_sum(ssd0);
143 sum[1] += v128_ssd_u8_sum(ssd1);
144 sum[2] += v128_ssd_u8_sum(ssd2);
145 sum[3] += v128_ssd_u8_sum(ssd3);
146}
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200147
148#if CONFIG_AOM_HIGHBITDEPTH
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200149SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
150 int rstride, int ostride, int x0, int y0,
151 int bottom, int right, int y, v128 *o,
152 v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100153 v128 *e, v128 *f, v128 *g, v128 *h,
154 int shift) {
155 const v128 k1 = v128_shr_u16(v128_load_aligned(org), shift);
156 const v128 k2 = v128_shr_u16(v128_load_aligned(org + ostride), shift);
157 const v128 l1 = v128_shr_u16(v128_load_aligned(rec), shift);
158 const v128 l2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
159 const v128 l3 =
160 v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift);
161 const v128 l4 = v128_shr_u16(
162 v128_load_aligned(rec + ((y != bottom) + 1) * rstride), shift);
163 *o = v128_unziplo_8(k1, k2);
164 *r = v128_unziplo_8(l1, l2);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200165 *a = v128_unziplo_8(
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100166 v128_shr_u16(v128_load_aligned(rec - 2 * (y != -y0) * rstride), shift),
167 l3);
168 *b = v128_unziplo_8(l3, l1);
169 *g = v128_unziplo_8(l2, l4);
170 *h = v128_unziplo_8(
171 l4,
172 v128_shr_u16(v128_load_unaligned(rec + (2 * (y != bottom) + 1) * rstride),
173 shift));
174 *c = v128_unziplo_8(
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200175 v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
176 v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100177 *d = v128_unziplo_8(
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200178 v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
179 v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100180 *e = v128_unziplo_8(
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200181 v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
182 v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100183 *f = v128_unziplo_8(
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200184 v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
185 v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100186 clip_sides(c, d, e, f, x0, right);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200187}
188
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200189void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
190 int rstride, int ostride, int x0, int y0,
191 int width, int height, int *sum0, int *sum1,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100192 unsigned int strength, int size,
193 unsigned int bitdepth) {
194 const int shift = bitdepth - 8;
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200195 const int bottom = height - 2 - y0;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200196 const int right = width - 8 - x0;
197 ssd128_internal ssd0 = v128_ssd_u8_init();
198 ssd128_internal ssd1 = v128_ssd_u8_init();
199 int y;
200
201 if (size != 8) { // Fallback to plain C
202 aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100203 sum0, sum1, strength, size, bitdepth);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200204 return;
205 }
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200206
207 rec += x0 + y0 * rstride;
208 org += x0 + y0 * ostride;
209
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200210 for (y = 0; y < 8; y += 2) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100211 v128 a, b, c, d, e, f, g, h, o, r;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200212 read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100213 &r, &a, &b, &c, &d, &e, &f, &g, &h, shift);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200214 ssd0 = v128_ssd_u8(ssd0, o, r);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100215 ssd1 = v128_ssd_u8(
216 ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength >> shift));
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200217 rec += rstride * 2;
218 org += ostride * 2;
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200219 }
220 *sum0 += v128_ssd_u8_sum(ssd0);
221 *sum1 += v128_ssd_u8_sum(ssd1);
222}
223
224void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
225 const uint16_t *org, int rstride,
226 int ostride, int x0, int y0,
227 int width, int height, int *sum,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100228 int size, unsigned int bitdepth) {
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200229 const int bottom = height - 2 - y0;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200230 const int right = width - 8 - x0;
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200231 ssd128_internal ssd0 = v128_ssd_u8_init();
232 ssd128_internal ssd1 = v128_ssd_u8_init();
233 ssd128_internal ssd2 = v128_ssd_u8_init();
234 ssd128_internal ssd3 = v128_ssd_u8_init();
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200235 int y;
236
237 if (size != 8) { // Fallback to plain C
238 aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100239 height, sum, size, bitdepth);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200240 return;
241 }
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200242
243 rec += x0 + y0 * rstride;
244 org += x0 + y0 * ostride;
245
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200246 for (y = 0; y < 8; y += 2) {
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100247 v128 a, b, c, d, e, f, g, h, o, r;
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200248 read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100249 &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8);
Steinar Midtskogenecf9a0c2016-09-13 16:37:13 +0200250 ssd0 = v128_ssd_u8(ssd0, o, r);
Steinar Midtskogen4f0b3ed2017-02-08 18:48:07 +0100251 calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3);
252 rec += rstride * 2;
253 org += ostride * 2;
Steinar Midtskogen3dbd55a2016-09-09 15:23:35 +0200254 }
255 sum[0] += v128_ssd_u8_sum(ssd0);
256 sum[1] += v128_ssd_u8_sum(ssd1);
257 sum[2] += v128_ssd_u8_sum(ssd2);
258 sum[3] += v128_ssd_u8_sum(ssd3);
259}
260#endif