/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
| 11 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 12 | #include "./aom_dsp_rtcd.h" |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 13 | #include "aom_dsp/aom_simd.h" |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 14 | #include "aom_ports/mem.h" |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 15 | #include "aom_ports/bitops.h" |
| 16 | #include "av1/common/clpf_simd_kernel.h" |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 17 | |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 18 | SIMD_INLINE void clip_sides(v128 *c, v128 *d, v128 *e, v128 *f, int left, |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 19 | int right) { |
| 20 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 21 | c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 22 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 23 | d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 24 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 25 | e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 26 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 27 | f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 28 | |
| 29 | if (!left) { // Left clipping |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 30 | *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 31 | *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 32 | } |
| 33 | if (!right) { // Right clipping |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 34 | *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 35 | *f = v128_shuffle_8(*f, v128_load_aligned(f_shuff)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 36 | } |
| 37 | } |
| 38 | |
| 39 | SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org, |
| 40 | int rstride, int ostride, int x0, int y0, |
| 41 | int bottom, int right, int y, v128 *o, v128 *r, |
| 42 | v128 *a, v128 *b, v128 *c, v128 *d, v128 *e, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 43 | v128 *f, v128 *g, v128 *h) { |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 44 | const v64 k1 = v64_load_aligned(org); |
| 45 | const v64 k2 = v64_load_aligned(org + ostride); |
| 46 | const v64 l1 = v64_load_aligned(rec); |
| 47 | const v64 l2 = v64_load_aligned(rec + rstride); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 48 | const v64 l3 = v64_load_aligned(rec - (y != -y0) * rstride); |
| 49 | const v64 l4 = v64_load_aligned(rec + ((y != bottom) + 1) * rstride); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 50 | *o = v128_from_v64(k1, k2); |
| 51 | *r = v128_from_v64(l1, l2); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 52 | *a = v128_from_v64(v64_load_aligned(rec - 2 * (y != -y0) * rstride), l3); |
| 53 | *b = v128_from_v64(l3, l1); |
| 54 | *g = v128_from_v64(l2, l4); |
| 55 | *h = v128_from_v64(l4, |
| 56 | v64_load_aligned(rec + (2 * (y != bottom) + 1) * rstride)); |
| 57 | *c = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0), |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 58 | v64_load_unaligned(rec - 2 * !!x0 + rstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 59 | *d = v128_from_v64(v64_load_unaligned(rec - !!x0), |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 60 | v64_load_unaligned(rec - !!x0 + rstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 61 | *e = v128_from_v64(v64_load_unaligned(rec + !!right), |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 62 | v64_load_unaligned(rec + !!right + rstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 63 | *f = v128_from_v64(v64_load_unaligned(rec + 2 * !!right), |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 64 | v64_load_unaligned(rec + 2 * !!right + rstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 65 | clip_sides(c, d, e, f, x0, right); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 66 | } |
| 67 | |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 68 | void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, |
| 69 | int rstride, int ostride, int x0, int y0, |
| 70 | int width, int height, int *sum0, int *sum1, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 71 | unsigned int strength, int size, |
| 72 | unsigned int bd) { |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 73 | const int bottom = height - 2 - y0; |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 74 | const int right = width - 8 - x0; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 75 | ssd128_internal ssd0 = v128_ssd_u8_init(); |
| 76 | ssd128_internal ssd1 = v128_ssd_u8_init(); |
| 77 | int y; |
| 78 | |
| 79 | if (size != 8) { // Fallback to plain C |
| 80 | aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 81 | sum1, strength, size, bd); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 82 | return; |
| 83 | } |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 84 | |
| 85 | rec += x0 + y0 * rstride; |
| 86 | org += x0 + y0 * ostride; |
| 87 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 88 | for (y = 0; y < 8; y += 2) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 89 | v128 a, b, c, d, e, f, g, h, o, r; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 90 | read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 91 | &a, &b, &c, &d, &e, &f, &g, &h); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 92 | ssd0 = v128_ssd_u8(ssd0, o, r); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 93 | ssd1 = |
| 94 | v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 95 | rec += rstride * 2; |
| 96 | org += ostride * 2; |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 97 | } |
| 98 | *sum0 += v128_ssd_u8_sum(ssd0); |
| 99 | *sum1 += v128_ssd_u8_sum(ssd1); |
| 100 | } |
| 101 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 102 | SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 103 | v128 d, v128 e, v128 f, v128 g, v128 h, |
| 104 | ssd128_internal *ssd1, ssd128_internal *ssd2, |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 105 | ssd128_internal *ssd3) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 106 | *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1)); |
| 107 | *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2)); |
| 108 | *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 109 | } |
| 110 | |
| 111 | // Test multiple filter strengths at once. |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 112 | void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, |
| 113 | int rstride, int ostride, int x0, int y0, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 114 | int width, int height, int *sum, int size, |
| 115 | unsigned int bd) { |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 116 | const int bottom = height - 2 - y0; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 117 | const int right = width - 8 - x0; |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 118 | ssd128_internal ssd0 = v128_ssd_u8_init(); |
| 119 | ssd128_internal ssd1 = v128_ssd_u8_init(); |
| 120 | ssd128_internal ssd2 = v128_ssd_u8_init(); |
| 121 | ssd128_internal ssd3 = v128_ssd_u8_init(); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 122 | int y; |
| 123 | |
| 124 | if (size != 8) { // Fallback to plain C |
| 125 | aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 126 | sum, size, bd); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 127 | return; |
| 128 | } |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 129 | |
| 130 | rec += x0 + y0 * rstride; |
| 131 | org += x0 + y0 * ostride; |
| 132 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 133 | for (y = 0; y < 8; y += 2) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 134 | v128 a, b, c, d, e, f, g, h, o, r; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 135 | read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 136 | &a, &b, &c, &d, &e, &f, &g, &h); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 137 | ssd0 = v128_ssd_u8(ssd0, o, r); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 138 | calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 139 | rec += 2 * rstride; |
| 140 | org += 2 * ostride; |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 141 | } |
| 142 | sum[0] += v128_ssd_u8_sum(ssd0); |
| 143 | sum[1] += v128_ssd_u8_sum(ssd1); |
| 144 | sum[2] += v128_ssd_u8_sum(ssd2); |
| 145 | sum[3] += v128_ssd_u8_sum(ssd3); |
| 146 | } |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 147 | |
| 148 | #if CONFIG_AOM_HIGHBITDEPTH |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 149 | SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org, |
| 150 | int rstride, int ostride, int x0, int y0, |
| 151 | int bottom, int right, int y, v128 *o, |
| 152 | v128 *r, v128 *a, v128 *b, v128 *c, v128 *d, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 153 | v128 *e, v128 *f, v128 *g, v128 *h, |
| 154 | int shift) { |
| 155 | const v128 k1 = v128_shr_u16(v128_load_aligned(org), shift); |
| 156 | const v128 k2 = v128_shr_u16(v128_load_aligned(org + ostride), shift); |
| 157 | const v128 l1 = v128_shr_u16(v128_load_aligned(rec), shift); |
| 158 | const v128 l2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); |
| 159 | const v128 l3 = |
| 160 | v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift); |
| 161 | const v128 l4 = v128_shr_u16( |
| 162 | v128_load_aligned(rec + ((y != bottom) + 1) * rstride), shift); |
| 163 | *o = v128_unziplo_8(k1, k2); |
| 164 | *r = v128_unziplo_8(l1, l2); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 165 | *a = v128_unziplo_8( |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 166 | v128_shr_u16(v128_load_aligned(rec - 2 * (y != -y0) * rstride), shift), |
| 167 | l3); |
| 168 | *b = v128_unziplo_8(l3, l1); |
| 169 | *g = v128_unziplo_8(l2, l4); |
| 170 | *h = v128_unziplo_8( |
| 171 | l4, |
| 172 | v128_shr_u16(v128_load_unaligned(rec + (2 * (y != bottom) + 1) * rstride), |
| 173 | shift)); |
| 174 | *c = v128_unziplo_8( |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 175 | v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift), |
| 176 | v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 177 | *d = v128_unziplo_8( |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 178 | v128_shr_u16(v128_load_unaligned(rec - !!x0), shift), |
| 179 | v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 180 | *e = v128_unziplo_8( |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 181 | v128_shr_u16(v128_load_unaligned(rec + !!right), shift), |
| 182 | v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 183 | *f = v128_unziplo_8( |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 184 | v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift), |
| 185 | v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 186 | clip_sides(c, d, e, f, x0, right); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 187 | } |
| 188 | |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 189 | void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org, |
| 190 | int rstride, int ostride, int x0, int y0, |
| 191 | int width, int height, int *sum0, int *sum1, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 192 | unsigned int strength, int size, |
| 193 | unsigned int bitdepth) { |
| 194 | const int shift = bitdepth - 8; |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 195 | const int bottom = height - 2 - y0; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 196 | const int right = width - 8 - x0; |
| 197 | ssd128_internal ssd0 = v128_ssd_u8_init(); |
| 198 | ssd128_internal ssd1 = v128_ssd_u8_init(); |
| 199 | int y; |
| 200 | |
| 201 | if (size != 8) { // Fallback to plain C |
| 202 | aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 203 | sum0, sum1, strength, size, bitdepth); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 204 | return; |
| 205 | } |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 206 | |
| 207 | rec += x0 + y0 * rstride; |
| 208 | org += x0 + y0 * ostride; |
| 209 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 210 | for (y = 0; y < 8; y += 2) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 211 | v128 a, b, c, d, e, f, g, h, o, r; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 212 | read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 213 | &r, &a, &b, &c, &d, &e, &f, &g, &h, shift); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 214 | ssd0 = v128_ssd_u8(ssd0, o, r); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 215 | ssd1 = v128_ssd_u8( |
| 216 | ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength >> shift)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 217 | rec += rstride * 2; |
| 218 | org += ostride * 2; |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 219 | } |
| 220 | *sum0 += v128_ssd_u8_sum(ssd0); |
| 221 | *sum1 += v128_ssd_u8_sum(ssd1); |
| 222 | } |
| 223 | |
| 224 | void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec, |
| 225 | const uint16_t *org, int rstride, |
| 226 | int ostride, int x0, int y0, |
| 227 | int width, int height, int *sum, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 228 | int size, unsigned int bitdepth) { |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 229 | const int bottom = height - 2 - y0; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 230 | const int right = width - 8 - x0; |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 231 | ssd128_internal ssd0 = v128_ssd_u8_init(); |
| 232 | ssd128_internal ssd1 = v128_ssd_u8_init(); |
| 233 | ssd128_internal ssd2 = v128_ssd_u8_init(); |
| 234 | ssd128_internal ssd3 = v128_ssd_u8_init(); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 235 | int y; |
| 236 | |
| 237 | if (size != 8) { // Fallback to plain C |
| 238 | aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 239 | height, sum, size, bitdepth); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 240 | return; |
| 241 | } |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 242 | |
| 243 | rec += x0 + y0 * rstride; |
| 244 | org += x0 + y0 * ostride; |
| 245 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 246 | for (y = 0; y < 8; y += 2) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 247 | v128 a, b, c, d, e, f, g, h, o, r; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 248 | read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 249 | &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 250 | ssd0 = v128_ssd_u8(ssd0, o, r); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame^] | 251 | calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3); |
| 252 | rec += rstride * 2; |
| 253 | org += ostride * 2; |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 254 | } |
| 255 | sum[0] += v128_ssd_u8_sum(ssd0); |
| 256 | sum[1] += v128_ssd_u8_sum(ssd1); |
| 257 | sum[2] += v128_ssd_u8_sum(ssd2); |
| 258 | sum[3] += v128_ssd_u8_sum(ssd3); |
| 259 | } |
| 260 | #endif |