Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | */ |
| 11 | |
Steinar Midtskogen | b8ff6aa | 2017-03-25 18:52:22 +0100 | [diff] [blame] | 12 | #include "./av1_rtcd.h" |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 13 | #include "aom_ports/bitops.h" |
Steinar Midtskogen | 8ff52fc | 2017-04-04 12:29:19 +0200 | [diff] [blame] | 14 | #include "aom_ports/mem.h" |
Steinar Midtskogen | 45544f9 | 2017-03-31 14:40:06 +0200 | [diff] [blame] | 15 | |
// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
//
// 16-bit lane version.  Computes, per lane, the clamped pixel difference used
// by the CLPF tap: the difference is attenuated so that large differences
// (likely real edges) contribute less than small ones (likely ringing).
SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
                             unsigned int adjdamp) {
  v128 diff = v128_sub_16(a, b);
  // Arithmetic shift by 15 yields 0 for non-negative lanes and -1 (all ones)
  // for negative lanes; used below to restore the sign branchlessly.
  const v128 sign = v128_shr_n_s16(diff, 15);
  diff = v128_abs_s16(diff);
  // Saturating subtract implements max(0, threshold - (diff >> adjdamp)).
  const v128 s =
      v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
  // (x + sign) ^ sign == x when sign == 0, == -x when sign == -1
  // (two's-complement negation), i.e. reapplies sign(a-b) to the magnitude.
  return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
}
| 26 | |
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
//
// 8-bit lane version of constrain16.  Inputs are 16 x 16-bit pixels in a
// v256; the difference is narrowed (with signed saturation) to 16 x 8-bit
// lanes so the remaining arithmetic runs on twice as many lanes per op.
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                           unsigned int adjdamp) {
  const v256 diff16 = v256_sub_16(a, b);
  // Narrow 16-bit differences to saturated signed 8-bit lanes.
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
  // 0 for non-negative lanes, -1 for negative lanes.
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
  diff = v128_abs_s8(diff);
  // (x + sign) ^ sign branchlessly negates lanes where sign == -1, restoring
  // the sign of (a - b) to the attenuated magnitude.
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
}
| 40 | |
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
//
// Applies the 8-tap CLPF to the pixels in x and returns the filtered pixels
// narrowed to unsigned 8 bits.  a..h are the eight neighbour vectors; taps
// b, d, e, g have weight 3 and a, c, f, h have weight 1.
SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
                            v256 f, v256 g, v256 h, unsigned int s,
                            unsigned int dmp) {
  // Sum of the four weight-3 taps...
  const v128 bdeg =
      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
  // ...added to the weight-1 taps, with bdeg counted three times.
  const v128 delta = v128_add_8(
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
  // x + (8 + delta - (delta < 0)) >> 4: rounds delta/16 to nearest with ties
  // away from zero (the cmplt term subtracts 1 from negative deltas before
  // the arithmetic shift).  x is narrowed from 16 to 8 bits with unsigned
  // saturation before the delta is applied.
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(8),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          4));
}
| 62 | |
// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
//         3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d)
//
// Horizontal-only 4-tap variant of calc_delta: a..d are the pixels at
// offsets -2, -1, +1, +2 within the same row.  Result is narrowed to
// unsigned 8 bits.
SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
                             unsigned int s, unsigned int dmp) {
  // Weight-3 taps (immediate neighbours), counted three times below.
  const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
  const v128 delta =
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(v128_add_8(bc, bc), bc));
  // Round delta/8 to nearest, ties away from zero (see calc_delta), then add
  // to x narrowed to 8 bits with unsigned saturation.
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(4),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          3));
}
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 78 | |
// Process blocks of width 8, two lines at a time, 8 bit.
// src holds 16-bit pixels; dst receives the filtered 8-bit result.
static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // Two rows are filtered per iteration; each v256 packs the taps for
    // both rows (high half = first row, low half = second row).
    const v128 l1 = v128_load_aligned(src);                // row y
    const v128 l2 = v128_load_aligned(src + sstride);      // row y + 1
    const v128 l3 = v128_load_aligned(src - sstride);      // row y - 1
    const v128 l4 = v128_load_aligned(src + 2 * sstride);  // row y + 2
    // Vertical taps: a/h are two rows away, b/g one row away.
    const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
    const v256 b = v256_from_v128(l3, l1);
    const v256 g = v256_from_v128(l2, l4);
    const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
    // Horizontal taps at offsets -2, -1, +1, +2 (unaligned loads).
    const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
                              strength, adjdamp);

    // High 8 bytes of o are the first row, low 8 bytes the second.
    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}
| 112 | |
// Process blocks of width 4, four lines at a time, 8 bit.
// src holds 16-bit pixels; dst receives the filtered 8-bit result.
static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    // Four 4-wide rows fill each v256 (one v64 per row, top row in the
    // highest lanes), so one calc_delta call filters four rows at once.
    const v64 l0 = v64_load_aligned(src - 2 * sstride);
    const v64 l1 = v64_load_aligned(src - sstride);
    const v64 l2 = v64_load_aligned(src);
    const v64 l3 = v64_load_aligned(src + sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v64 l5 = v64_load_aligned(src + 3 * sstride);
    const v64 l6 = v64_load_aligned(src + 4 * sstride);
    const v64 l7 = v64_load_aligned(src + 5 * sstride);
    // Arguments in calc_delta order: x (the four current rows), then the
    // vertical taps a and b (rows -2, -1 relative to each current row), the
    // horizontal taps c..f (columns -2, -1, +1, +2), and the vertical taps
    // g and h (rows +1, +2).
    const v128 o =
        calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
                   v256_from_v64(l1, l2, l3, l4),
                   v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2)),
                   v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1)),
                   v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1)),
                   v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2)),
                   v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
                   strength, adjdamp);

    // Unpack the four 4-byte rows from o: the first row sits in the top
    // 4 bytes, so shift right by 12/8/4/0 bytes respectively.
    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}
| 160 | |
// Process blocks of width 8, horizontal filter only, two lines at a time,
// 8 bit.
static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // Two rows per iteration; each v256 packs both rows (high half = first
    // row).  Only horizontal taps at column offsets -2, -1, +1, +2 are used.
    const v256 x = v256_from_v128(v128_load_aligned(src),
                                  v128_load_aligned(src + sstride));
    const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);

    // High 8 bytes of o are the first row, low 8 bytes the second.
    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}
| 186 | |
| 187 | // Process blocks of width 4, four lines at a time, 8 bit. |
Steinar Midtskogen | 569c7b9 | 2017-04-02 10:45:16 +0200 | [diff] [blame] | 188 | static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src, |
| 189 | int dstride, int sstride, int sizey, |
| 190 | unsigned int strength, |
| 191 | unsigned int adjdamp) { |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 192 | int y; |
| 193 | |
| 194 | for (y = 0; y < sizey; y += 4) { |
Steinar Midtskogen | 9b501e1 | 2017-04-03 13:31:54 +0200 | [diff] [blame] | 195 | const v256 a = v256_from_v64(v64_load_unaligned(src - 2), |
| 196 | v64_load_unaligned(src + sstride - 2), |
| 197 | v64_load_unaligned(src + 2 * sstride - 2), |
| 198 | v64_load_unaligned(src + 3 * sstride - 2)); |
| 199 | const v256 b = v256_from_v64(v64_load_unaligned(src - 1), |
| 200 | v64_load_unaligned(src + sstride - 1), |
| 201 | v64_load_unaligned(src + 2 * sstride - 1), |
| 202 | v64_load_unaligned(src + 3 * sstride - 1)); |
| 203 | const v256 c = v256_from_v64(v64_load_unaligned(src + 1), |
| 204 | v64_load_unaligned(src + sstride + 1), |
| 205 | v64_load_unaligned(src + 2 * sstride + 1), |
| 206 | v64_load_unaligned(src + 3 * sstride + 1)); |
| 207 | const v256 d = v256_from_v64(v64_load_unaligned(src + 2), |
| 208 | v64_load_unaligned(src + sstride + 2), |
| 209 | v64_load_unaligned(src + 2 * sstride + 2), |
| 210 | v64_load_unaligned(src + 3 * sstride + 2)); |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 211 | |
| 212 | const v128 o = calc_hdelta( |
Steinar Midtskogen | 9b501e1 | 2017-04-03 13:31:54 +0200 | [diff] [blame] | 213 | v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride), |
| 214 | v64_load_aligned(src + 2 * sstride), |
| 215 | v64_load_aligned(src + 3 * sstride)), |
| 216 | a, b, c, d, strength, adjdamp); |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 217 | |
| 218 | u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); |
| 219 | u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); |
| 220 | u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4))); |
| 221 | u32_store_aligned(dst + 3 * dstride, v128_low_u32(o)); |
| 222 | |
| 223 | dst += 4 * dstride; |
| 224 | src += 4 * sstride; |
| 225 | } |
| 226 | } |
| 227 | |
| 228 | void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride, |
| 229 | int sstride, int sizex, int sizey, |
| 230 | unsigned int strength, unsigned int dmp) { |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 231 | if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { |
| 232 | // Fallback to C for odd sizes: |
| 233 | // * block widths not 4 or 8 |
| 234 | // * block heights not a multiple of 4 if the block width is 4 |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 235 | aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp); |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 236 | } else { |
Steinar Midtskogen | 569c7b9 | 2017-04-02 10:45:16 +0200 | [diff] [blame] | 237 | (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))( |
| 238 | dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 239 | } |
| 240 | } |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 241 | |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 242 | void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, |
| 243 | int sstride, int sizex, int sizey, |
| 244 | unsigned int strength, unsigned int dmp) { |
| 245 | if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { |
| 246 | // Fallback to C for odd sizes: |
| 247 | // * block widths not 4 or 8 |
| 248 | // * block heights not a multiple of 4 if the block width is 4 |
| 249 | aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp); |
| 250 | } else { |
Steinar Midtskogen | 569c7b9 | 2017-04-02 10:45:16 +0200 | [diff] [blame] | 251 | (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))( |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 252 | dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 253 | } |
| 254 | } |
| 255 | |
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
//
// High-bitdepth counterpart of calc_delta: works entirely on 16-bit lanes
// and returns the filtered pixels at full 16-bit precision (no narrowing).
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                v128 f, v128 g, v128 h, unsigned int s,
                                unsigned int dmp) {
  // Sum of the four weight-3 taps, counted three times below.
  const v128 bdeg = v128_add_16(
      v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
  const v128 delta = v128_add_16(
      v128_add_16(
          v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
          v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
  // x + (8 + delta - (delta < 0)) >> 4: rounds delta/16 to nearest with
  // ties away from zero (see calc_delta).
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(8),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          4));
}
| 278 | |
// Filter two 4-wide high-bitdepth rows packed into each v128 (high half =
// first row) and store them to dst and dst + dstride.
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp, int dstride) {
  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}
| 286 | |
| 287 | static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 288 | v128 f, v128 g, v128 h, uint16_t *dst, |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 289 | unsigned int s, unsigned int adjdamp) { |
| 290 | v128_store_aligned(dst, |
| 291 | calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 292 | } |
| 293 | |
// delta = 1/8 * constrain(a, x, s, dmp) + 3/8 * constrain(b, x, s, dmp) +
//         3/8 * constrain(c, x, s, dmp) + 1/8 * constrain(d, x, s, dmp)
// (The previous comment said 1/16 and 3/16, but the code rounds with +4 and
// shifts by 3, i.e. divides by 8, matching the 8-bit calc_hdelta above.)
//
// Horizontal-only high-bitdepth 4-tap filter; a..d are the pixels at column
// offsets -2, -1, +1, +2.  Returns filtered pixels at 16-bit precision.
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
                                 unsigned int s, unsigned int dmp) {
  // Weight-3 taps (immediate neighbours), counted three times below.
  const v128 bc =
      v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
  const v128 delta = v128_add_16(
      v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(v128_add_16(bc, bc), bc));
  // Round delta/8 to nearest, ties away from zero (see calc_delta).
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(4),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          3));
}
| 310 | |
| 311 | static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 312 | uint16_t *dst, unsigned int s, |
| 313 | unsigned int adjdamp, int dstride) { |
| 314 | o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp); |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 315 | v64_store_aligned(dst, v128_high_v64(o)); |
| 316 | v64_store_aligned(dst + dstride, v128_low_v64(o)); |
| 317 | } |
| 318 | |
| 319 | static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 320 | uint16_t *dst, unsigned int s, |
| 321 | unsigned int adjdamp) { |
| 322 | v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp)); |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 323 | } |
| 324 | |
// Process blocks of width 4, two lines at time.
// High-bitdepth: both src and dst hold 16-bit pixels.
static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // Two 4-wide rows per v128 (high half = first row).
    const v64 l1 = v64_load_aligned(src);                // row y
    const v64 l2 = v64_load_aligned(src + sstride);      // row y + 1
    const v64 l3 = v64_load_aligned(src - sstride);      // row y - 1
    const v64 l4 = v64_load_aligned(src + 2 * sstride);  // row y + 2
    // Vertical taps: a/h two rows away, b/g one row away.
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    // Horizontal taps at column offsets -2, -1, +1, +2.
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
                    strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}
| 356 | |
// The most simple case. Start here if you need to understand the functions.
// Width-8 high-bitdepth filter: one row per iteration, 16-bit in and out.
static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                      int dstride, int sstride, int sizey,
                                      unsigned int strength,
                                      unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);  // the row being filtered
    // Vertical taps two rows and one row above/below.
    const v128 a = v128_load_aligned(src - 2 * sstride);
    const v128 b = v128_load_aligned(src - 1 * sstride);
    const v128 g = v128_load_aligned(src + sstride);
    const v128 h = v128_load_aligned(src + 2 * sstride);
    // Horizontal taps at column offsets -2, -1, +1, +2.
    const v128 c = v128_load_unaligned(src - 2);
    const v128 d = v128_load_unaligned(src - 1);
    const v128 e = v128_load_unaligned(src + 1);
    const v128 f = v128_load_unaligned(src + 2);

    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
    src += sstride;
    dst += dstride;
  }
}
| 380 | |
// Process blocks of width 4, horizontal filter, two lines at time.
// High-bitdepth: both src and dst hold 16-bit pixels.
static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
                                        int dstride, int sstride, int sizey,
                                        unsigned int strength,
                                        unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    // Two 4-wide rows per v128 (high half = first row); horizontal taps at
    // column offsets -2, -1, +1, +2 only.
    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
                                   v64_load_unaligned(src + sstride)),
                     a, b, c, d, dst, strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}
| 405 | |
| 406 | // Process blocks of width 8, horizontal filter, two lines at time. |
Steinar Midtskogen | 569c7b9 | 2017-04-02 10:45:16 +0200 | [diff] [blame] | 407 | static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, |
| 408 | int dstride, int sstride, int sizey, |
| 409 | unsigned int strength, |
| 410 | unsigned int adjdamp) { |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 411 | int y; |
| 412 | |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 413 | for (y = 0; y < sizey; y++) { |
| 414 | const v128 o = v128_load_aligned(src); |
| 415 | const v128 a = v128_load_unaligned(src - 2); |
| 416 | const v128 b = v128_load_unaligned(src - 1); |
| 417 | const v128 c = v128_load_unaligned(src + 1); |
| 418 | const v128 d = v128_load_unaligned(src + 2); |
| 419 | |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 420 | calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp); |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 421 | src += sstride; |
| 422 | dst += dstride; |
| 423 | } |
| 424 | } |
| 425 | |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 426 | void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src, |
| 427 | int dstride, int sstride, int sizex, |
| 428 | int sizey, unsigned int strength, |
Steinar Midtskogen | d280a84 | 2017-03-21 09:59:14 +0100 | [diff] [blame] | 429 | unsigned int dmp) { |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 430 | if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { |
| 431 | // Fallback to C for odd sizes: |
| 432 | // * block width not 4 or 8 |
| 433 | // * block heights not a multiple of 2 if the block width is 4 |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 434 | aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength, |
| 435 | dmp); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 436 | } else { |
Steinar Midtskogen | 569c7b9 | 2017-04-02 10:45:16 +0200 | [diff] [blame] | 437 | (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))( |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 438 | dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 439 | } |
| 440 | } |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 441 | |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 442 | void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src, |
| 443 | int dstride, int sstride, int sizex, |
| 444 | int sizey, unsigned int strength, |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 445 | unsigned int dmp) { |
| 446 | if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { |
| 447 | // Fallback to C for odd sizes: |
| 448 | // * block width not 4 or 8 |
| 449 | // * block heights not a multiple of 2 if the block width is 4 |
Steinar Midtskogen | 73aa77c | 2017-03-27 17:50:30 +0200 | [diff] [blame] | 450 | aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength, |
| 451 | dmp); |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 452 | } else { |
Steinar Midtskogen | 569c7b9 | 2017-04-02 10:45:16 +0200 | [diff] [blame] | 453 | (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))( |
Steinar Midtskogen | febe223 | 2017-04-02 09:47:27 +0200 | [diff] [blame] | 454 | dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength)); |
Steinar Midtskogen | 3c33def | 2017-03-21 12:56:17 +0100 | [diff] [blame] | 455 | } |
| 456 | } |