Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | */ |
| 11 | |
| 12 | #include "./aom_dsp_rtcd.h" |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 13 | #include "aom_ports/mem.h" |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 14 | #include "aom_ports/bitops.h" |
| 15 | #include "av1/common/clpf_simd_kernel.h" |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 16 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 17 | // Process blocks of width 8, two lines at a time, 8 bit. |
| 18 | static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride, |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 19 | int dstride, int x0, int y0, int sizey, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 20 | BOUNDARY_TYPE bt, unsigned int strength) { |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 21 | const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1; |
| 22 | const int right = !(bt & TILE_RIGHT_BOUNDARY); |
| 23 | const int left = !(bt & TILE_LEFT_BOUNDARY); |
| 24 | const int top = bt & TILE_ABOVE_BOUNDARY ? y0 : -1; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 25 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 26 | c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 27 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 28 | d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 29 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 30 | e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 31 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 32 | f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 33 | int y; |
| 34 | |
Steinar Midtskogen | e8224c7 | 2016-08-24 13:00:04 +0200 | [diff] [blame] | 35 | dst += x0 + y0 * dstride; |
| 36 | src += x0 + y0 * sstride; |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 37 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 38 | for (y = 0; y < sizey; y += 2) { |
| 39 | const v64 l1 = v64_load_aligned(src); |
| 40 | const v64 l2 = v64_load_aligned(src + sstride); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 41 | const v64 l3 = v64_load_aligned(src - (y != top) * sstride); |
| 42 | const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 43 | v128 o = v128_from_v64(l1, l2); |
| 44 | const v128 a = |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 45 | v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3); |
| 46 | const v128 b = v128_from_v64(l3, l1); |
| 47 | const v128 g = v128_from_v64(l2, l4); |
| 48 | const v128 h = v128_from_v64( |
| 49 | l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride)); |
| 50 | v128 c, d, e, f; |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 51 | |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 52 | if (left) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 53 | c = v128_from_v64(v64_load_unaligned(src - 2), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 54 | v64_load_unaligned(src - 2 + sstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 55 | d = v128_from_v64(v64_load_unaligned(src - 1), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 56 | v64_load_unaligned(src - 1 + sstride)); |
| 57 | } else { // Left clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 58 | c = v128_shuffle_8(o, v128_load_aligned(c_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 59 | d = v128_shuffle_8(o, v128_load_aligned(d_shuff)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 60 | } |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 61 | if (right) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 62 | e = v128_from_v64(v64_load_unaligned(src + 1), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 63 | v64_load_unaligned(src + 1 + sstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 64 | f = v128_from_v64(v64_load_unaligned(src + 2), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 65 | v64_load_unaligned(src + 2 + sstride)); |
| 66 | } else { // Right clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 67 | e = v128_shuffle_8(o, v128_load_aligned(e_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 68 | f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 69 | } |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 70 | |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 71 | o = calc_delta(o, a, b, c, d, e, f, g, h, strength); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 72 | v64_store_aligned(dst, v128_high_v64(o)); |
| 73 | v64_store_aligned(dst + dstride, v128_low_v64(o)); |
| 74 | src += sstride * 2; |
| 75 | dst += dstride * 2; |
| 76 | } |
| 77 | } |
| 78 | |
Steinar Midtskogen | f844e6e | 2017-02-09 17:24:37 +0100 | [diff] [blame^] | 79 | // As above, but with no clipping tests |
| 80 | static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride, |
| 81 | int dstride, int x0, int y0, int sizey, |
| 82 | unsigned int strength) { |
| 83 | int y; |
| 84 | |
| 85 | dst += x0 + y0 * dstride; |
| 86 | src += x0 + y0 * sstride; |
| 87 | |
| 88 | for (y = 0; y < sizey; y += 2) { |
| 89 | const v64 l1 = v64_load_aligned(src); |
| 90 | const v64 l2 = v64_load_aligned(src + sstride); |
| 91 | const v64 l3 = v64_load_aligned(src - sstride); |
| 92 | const v64 l4 = v64_load_aligned(src + 2 * sstride); |
| 93 | const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3); |
| 94 | const v128 b = v128_from_v64(l3, l1); |
| 95 | const v128 g = v128_from_v64(l2, l4); |
| 96 | const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride)); |
| 97 | const v128 c = v128_from_v64(v64_load_unaligned(src - 2), |
| 98 | v64_load_unaligned(src - 2 + sstride)); |
| 99 | const v128 d = v128_from_v64(v64_load_unaligned(src - 1), |
| 100 | v64_load_unaligned(src - 1 + sstride)); |
| 101 | const v128 e = v128_from_v64(v64_load_unaligned(src + 1), |
| 102 | v64_load_unaligned(src + 1 + sstride)); |
| 103 | const v128 f = v128_from_v64(v64_load_unaligned(src + 2), |
| 104 | v64_load_unaligned(src + 2 + sstride)); |
| 105 | const v128 o = |
| 106 | calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, strength); |
| 107 | |
| 108 | v64_store_aligned(dst, v128_high_v64(o)); |
| 109 | v64_store_aligned(dst + dstride, v128_low_v64(o)); |
| 110 | src += sstride * 2; |
| 111 | dst += dstride * 2; |
| 112 | } |
| 113 | } |
| 114 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 115 | // Process blocks of width 4, four lines at a time, 8 bit. |
| 116 | static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride, |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 117 | int dstride, int x0, int y0, int sizey, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 118 | BOUNDARY_TYPE bt, unsigned int strength) { |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 119 | const int right = !(bt & TILE_RIGHT_BOUNDARY); |
| 120 | const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 4 : -1; |
| 121 | const int left = !(bt & TILE_LEFT_BOUNDARY); |
| 122 | const int top = bt & TILE_ABOVE_BOUNDARY ? y0 : -1; |
| 123 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 124 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 125 | c_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 126 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 127 | d_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 128 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 129 | e_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 130 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 131 | f_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 132 | int y; |
| 133 | |
| 134 | dst += x0 + y0 * dstride; |
| 135 | src += x0 + y0 * sstride; |
| 136 | |
| 137 | for (y = 0; y < sizey; y += 4) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 138 | const uint32_t l0 = u32_load_aligned(src - 2 * (y != top) * sstride); |
| 139 | const uint32_t l1 = u32_load_aligned(src - (y != top) * sstride); |
| 140 | const uint32_t l2 = u32_load_aligned(src); |
| 141 | const uint32_t l3 = u32_load_aligned(src + sstride); |
| 142 | const uint32_t l4 = u32_load_aligned(src + 2 * sstride); |
| 143 | const uint32_t l5 = u32_load_aligned(src + 3 * sstride); |
| 144 | const uint32_t l6 = u32_load_aligned(src + ((y != bottom) + 3) * sstride); |
| 145 | const uint32_t l7 = |
| 146 | u32_load_aligned(src + (2 * (y != bottom) + 3) * sstride); |
| 147 | v128 o = v128_from_32(l2, l3, l4, l5); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 148 | const v128 a = v128_from_32(l0, l1, l2, l3); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 149 | const v128 b = v128_from_32(l1, l2, l3, l4); |
| 150 | const v128 g = v128_from_32(l3, l4, l5, l6); |
| 151 | const v128 h = v128_from_32(l4, l5, l6, l7); |
| 152 | v128 c, d, e, f; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 153 | |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 154 | if (left) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 155 | c = v128_from_32(u32_load_unaligned(src - 2), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 156 | u32_load_unaligned(src + sstride - 2), |
| 157 | u32_load_unaligned(src + 2 * sstride - 2), |
| 158 | u32_load_unaligned(src + 3 * sstride - 2)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 159 | d = v128_from_32(u32_load_unaligned(src - 1), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 160 | u32_load_unaligned(src + sstride - 1), |
| 161 | u32_load_unaligned(src + 2 * sstride - 1), |
| 162 | u32_load_unaligned(src + 3 * sstride - 1)); |
| 163 | } else { // Left clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 164 | c = v128_shuffle_8(o, v128_load_aligned(c_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 165 | d = v128_shuffle_8(o, v128_load_aligned(d_shuff)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 166 | } |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 167 | if (right) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 168 | e = v128_from_32(u32_load_unaligned(src + 1), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 169 | u32_load_unaligned(src + sstride + 1), |
| 170 | u32_load_unaligned(src + 2 * sstride + 1), |
| 171 | u32_load_unaligned(src + 3 * sstride + 1)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 172 | f = v128_from_32(u32_load_unaligned(src + 2), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 173 | u32_load_unaligned(src + sstride + 2), |
| 174 | u32_load_unaligned(src + 2 * sstride + 2), |
| 175 | u32_load_unaligned(src + 3 * sstride + 2)); |
| 176 | } else { // Right clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 177 | e = v128_shuffle_8(o, v128_load_aligned(e_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 178 | f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 179 | } |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 180 | |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 181 | o = calc_delta(o, a, b, c, d, e, f, g, h, strength); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 182 | u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); |
| 183 | u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); |
| 184 | u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4))); |
| 185 | u32_store_aligned(dst + 3 * dstride, v128_low_u32(o)); |
| 186 | |
| 187 | dst += 4 * dstride; |
| 188 | src += 4 * sstride; |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 189 | } |
| 190 | } |
| 191 | |
Steinar Midtskogen | f844e6e | 2017-02-09 17:24:37 +0100 | [diff] [blame^] | 192 | // As above, but with no clipping tests |
| 193 | static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride, |
| 194 | int dstride, int x0, int y0, int sizey, |
| 195 | unsigned int strength) { |
| 196 | int y; |
| 197 | |
| 198 | dst += x0 + y0 * dstride; |
| 199 | src += x0 + y0 * sstride; |
| 200 | |
| 201 | for (y = 0; y < sizey; y += 4) { |
| 202 | const uint32_t l0 = u32_load_aligned(src - 2 * sstride); |
| 203 | const uint32_t l1 = u32_load_aligned(src - sstride); |
| 204 | const uint32_t l2 = u32_load_aligned(src); |
| 205 | const uint32_t l3 = u32_load_aligned(src + sstride); |
| 206 | const uint32_t l4 = u32_load_aligned(src + 2 * sstride); |
| 207 | const uint32_t l5 = u32_load_aligned(src + 3 * sstride); |
| 208 | const uint32_t l6 = u32_load_aligned(src + 4 * sstride); |
| 209 | const uint32_t l7 = u32_load_aligned(src + 5 * sstride); |
| 210 | const v128 a = v128_from_32(l0, l1, l2, l3); |
| 211 | const v128 b = v128_from_32(l1, l2, l3, l4); |
| 212 | const v128 g = v128_from_32(l3, l4, l5, l6); |
| 213 | const v128 h = v128_from_32(l4, l5, l6, l7); |
| 214 | const v128 c = v128_from_32(u32_load_unaligned(src - 2), |
| 215 | u32_load_unaligned(src + sstride - 2), |
| 216 | u32_load_unaligned(src + 2 * sstride - 2), |
| 217 | u32_load_unaligned(src + 3 * sstride - 2)); |
| 218 | const v128 d = v128_from_32(u32_load_unaligned(src - 1), |
| 219 | u32_load_unaligned(src + sstride - 1), |
| 220 | u32_load_unaligned(src + 2 * sstride - 1), |
| 221 | u32_load_unaligned(src + 3 * sstride - 1)); |
| 222 | const v128 e = v128_from_32(u32_load_unaligned(src + 1), |
| 223 | u32_load_unaligned(src + sstride + 1), |
| 224 | u32_load_unaligned(src + 2 * sstride + 1), |
| 225 | u32_load_unaligned(src + 3 * sstride + 1)); |
| 226 | const v128 f = v128_from_32(u32_load_unaligned(src + 2), |
| 227 | u32_load_unaligned(src + sstride + 2), |
| 228 | u32_load_unaligned(src + 2 * sstride + 2), |
| 229 | u32_load_unaligned(src + 3 * sstride + 2)); |
| 230 | |
| 231 | const v128 o = calc_delta(v128_from_32(l2, l3, l4, l5), a, b, c, d, e, f, g, |
| 232 | h, strength); |
| 233 | |
| 234 | u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); |
| 235 | u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); |
| 236 | u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4))); |
| 237 | u32_store_aligned(dst + 3 * dstride, v128_low_u32(o)); |
| 238 | |
| 239 | dst += 4 * dstride; |
| 240 | src += 4 * sstride; |
| 241 | } |
| 242 | } |
| 243 | |
Steinar Midtskogen | e8224c7 | 2016-08-24 13:00:04 +0200 | [diff] [blame] | 244 | void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, |
| 245 | int dstride, int x0, int y0, int sizex, |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 246 | int sizey, unsigned int strength, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 247 | BOUNDARY_TYPE bt, unsigned int bd) { |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 248 | if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { |
| 249 | // Fallback to C for odd sizes: |
| 250 | // * block widths not 4 or 8 |
| 251 | // * block heights not a multiple of 4 if the block width is 4 |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 252 | aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 253 | bt, bd); |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 254 | } else { |
Steinar Midtskogen | f844e6e | 2017-02-09 17:24:37 +0100 | [diff] [blame^] | 255 | if (bt) |
| 256 | (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, |
| 257 | y0, sizey, bt, strength); |
| 258 | else |
| 259 | (sizex == 4 ? clpf_block4_noclip : clpf_block8_noclip)( |
| 260 | src, dst, sstride, dstride, x0, y0, sizey, strength); |
Steinar Midtskogen | be668e9 | 2016-08-05 12:12:38 +0200 | [diff] [blame] | 261 | } |
| 262 | } |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 263 | |
| 264 | #if CONFIG_AOM_HIGHBITDEPTH |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 265 | // sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) - |
| 266 | // strength + (abs(a - b) >> (bd - 3 - log2(s))))) |
| 267 | SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength, |
| 268 | unsigned int bd) { |
| 269 | const v128 diff = v128_sub_16(v128_max_s16(a, b), v128_min_s16(a, b)); |
| 270 | const v128 sign = v128_cmpeq_16(v128_min_s16(a, b), a); // -(a <= b) |
| 271 | const v128 zero = v128_zero(); |
| 272 | const v128 s = v128_max_s16( |
| 273 | zero, v128_sub_16(v128_dup_16(strength), |
| 274 | v128_shr_u16(diff, bd - 3 - get_msb(strength)))); |
| 275 | return v128_sub_16( |
| 276 | v128_xor(sign, |
| 277 | v128_max_s16( |
| 278 | zero, v128_sub_16( |
| 279 | diff, v128_max_s16(zero, v128_sub_16(diff, s))))), |
| 280 | sign); |
| 281 | } |
| 282 | |
| 283 | // delta = 1/16 * constrain(a, x, s, bd) + 3/16 * constrain(b, x, s, bd) + |
| 284 | // 1/16 * constrain(c, x, s, bd) + 3/16 * constrain(d, x, s, bd) + |
| 285 | // 3/16 * constrain(e, x, s, bd) + 1/16 * constrain(f, x, s, bd) + |
| 286 | // 3/16 * constrain(g, x, s, bd) + 1/16 * constrain(h, x, s, bd) |
| 287 | SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, |
| 288 | v128 f, v128 g, v128 h, unsigned int s, |
| 289 | unsigned int bd) { |
| 290 | const v128 bdeg = v128_add_16( |
| 291 | v128_add_16(constrain_hbd(b, x, s, bd), constrain_hbd(d, x, s, bd)), |
| 292 | v128_add_16(constrain_hbd(e, x, s, bd), constrain_hbd(g, x, s, bd))); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 293 | const v128 delta = v128_add_16( |
| 294 | v128_add_16( |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 295 | v128_add_16(constrain_hbd(a, x, s, bd), constrain_hbd(c, x, s, bd)), |
| 296 | v128_add_16(constrain_hbd(f, x, s, bd), constrain_hbd(h, x, s, bd))), |
| 297 | v128_add_16(v128_add_16(bdeg, bdeg), bdeg)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 298 | return v128_add_16( |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 299 | x, |
| 300 | v128_shr_s16( |
| 301 | v128_add_16(v128_dup_16(8), |
| 302 | v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))), |
| 303 | 4)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 304 | } |
| 305 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 306 | static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 307 | v128 f, v128 g, v128 h, uint16_t *dst, |
| 308 | unsigned int s, unsigned int bd, int dstride) { |
| 309 | o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 310 | v64_store_aligned(dst, v128_high_v64(o)); |
| 311 | v64_store_aligned(dst + dstride, v128_low_v64(o)); |
| 312 | } |
| 313 | |
| 314 | static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 315 | v128 f, v128 g, v128 h, uint16_t *dst, |
| 316 | unsigned int s, unsigned int bd) { |
| 317 | v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 318 | } |
| 319 | |
| 320 | // Process blocks of width 4, two lines at time. |
| 321 | SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst, |
| 322 | int sstride, int dstride, int x0, int y0, |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 323 | int sizey, unsigned int strength, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 324 | BOUNDARY_TYPE bt, unsigned int bd) { |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 325 | const int right = !(bt & TILE_RIGHT_BOUNDARY); |
| 326 | const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1; |
| 327 | const int left = !(bt & TILE_LEFT_BOUNDARY); |
| 328 | const int top = bt & TILE_ABOVE_BOUNDARY ? y0 : -1; |
| 329 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 330 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 331 | c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 332 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 333 | d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 334 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 335 | e_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 336 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 337 | f_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 338 | int y; |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 339 | |
| 340 | dst += x0 + y0 * dstride; |
| 341 | src += x0 + y0 * sstride; |
| 342 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 343 | for (y = 0; y < sizey; y += 2) { |
| 344 | const v64 l1 = v64_load_aligned(src); |
| 345 | const v64 l2 = v64_load_aligned(src + sstride); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 346 | const v64 l3 = v64_load_aligned(src - (y != top) * sstride); |
| 347 | const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 348 | v128 o = v128_from_v64(l1, l2); |
| 349 | const v128 a = |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 350 | v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3); |
| 351 | const v128 b = v128_from_v64(l3, l1); |
| 352 | const v128 g = v128_from_v64(l2, l4); |
| 353 | const v128 h = v128_from_v64( |
| 354 | l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride)); |
| 355 | v128 c, d, e, f; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 356 | |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 357 | if (left) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 358 | c = v128_from_v64(v64_load_unaligned(src - 2), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 359 | v64_load_unaligned(src - 2 + sstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 360 | d = v128_from_v64(v64_load_unaligned(src - 1), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 361 | v64_load_unaligned(src - 1 + sstride)); |
| 362 | } else { // Left clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 363 | c = v128_shuffle_8(o, v128_load_aligned(c_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 364 | d = v128_shuffle_8(o, v128_load_aligned(d_shuff)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 365 | } |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 366 | if (right) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 367 | e = v128_from_v64(v64_load_unaligned(src + 1), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 368 | v64_load_unaligned(src + 1 + sstride)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 369 | f = v128_from_v64(v64_load_unaligned(src + 2), |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 370 | v64_load_unaligned(src + 2 + sstride)); |
| 371 | } else { // Right clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 372 | e = v128_shuffle_8(o, v128_load_aligned(e_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 373 | f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 374 | } |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 375 | calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, bd, dstride); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 376 | src += sstride * 2; |
| 377 | dst += dstride * 2; |
| 378 | } |
| 379 | } |
| 380 | |
Steinar Midtskogen | f844e6e | 2017-02-09 17:24:37 +0100 | [diff] [blame^] | 381 | // As above, but with no clipping tests |
| 382 | SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst, |
| 383 | int sstride, int dstride, int x0, |
| 384 | int y0, int sizey, |
| 385 | unsigned int strength, |
| 386 | unsigned int bd) { |
| 387 | int y; |
| 388 | |
| 389 | dst += x0 + y0 * dstride; |
| 390 | src += x0 + y0 * sstride; |
| 391 | |
| 392 | for (y = 0; y < sizey; y += 2) { |
| 393 | const v64 l1 = v64_load_aligned(src); |
| 394 | const v64 l2 = v64_load_aligned(src + sstride); |
| 395 | const v64 l3 = v64_load_aligned(src - sstride); |
| 396 | const v64 l4 = v64_load_aligned(src + 2 * sstride); |
| 397 | const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3); |
| 398 | const v128 b = v128_from_v64(l3, l1); |
| 399 | const v128 g = v128_from_v64(l2, l4); |
| 400 | const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride)); |
| 401 | const v128 c = v128_from_v64(v64_load_unaligned(src - 2), |
| 402 | v64_load_unaligned(src - 2 + sstride)); |
| 403 | const v128 d = v128_from_v64(v64_load_unaligned(src - 1), |
| 404 | v64_load_unaligned(src - 1 + sstride)); |
| 405 | const v128 e = v128_from_v64(v64_load_unaligned(src + 1), |
| 406 | v64_load_unaligned(src + 1 + sstride)); |
| 407 | const v128 f = v128_from_v64(v64_load_unaligned(src + 2), |
| 408 | v64_load_unaligned(src + 2 + sstride)); |
| 409 | |
| 410 | calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst, |
| 411 | strength, bd, dstride); |
| 412 | src += sstride * 2; |
| 413 | dst += dstride * 2; |
| 414 | } |
| 415 | } |
| 416 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 417 | // The most simple case. Start here if you need to understand the functions. |
| 418 | SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, |
| 419 | int dstride, int x0, int y0, int sizey, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 420 | unsigned int strength, BOUNDARY_TYPE bt, |
| 421 | unsigned int bd) { |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 422 | const int right = !(bt & TILE_RIGHT_BOUNDARY); |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 423 | const int left = !(bt & TILE_LEFT_BOUNDARY); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 424 | const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2; |
| 425 | const int ymax = sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1; |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 426 | |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 427 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 428 | c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 429 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 430 | d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 431 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 432 | e_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 433 | DECLARE_ALIGNED(16, static const uint64_t, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 434 | f_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL }; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 435 | int y; |
| 436 | |
| 437 | dst += x0 + y0 * dstride; |
| 438 | src += x0 + y0 * sstride; |
| 439 | |
| 440 | // Read 8 set of pixels at a time. Clipping along upper and lower |
| 441 | // edges is handled by reading the upper or lower line twice. |
| 442 | // Clipping along the left and right edges is handled by shuffle |
| 443 | // instructions doing shift and pad. |
| 444 | for (y = 0; y < sizey; y++) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 445 | const v128 o = v128_load_aligned(src + y * sstride); |
| 446 | const v128 a = v128_load_aligned(src + AOMMAX(ymin, y - 2) * sstride); |
| 447 | const v128 b = v128_load_aligned(src + AOMMAX(ymin, y - 1) * sstride); |
| 448 | const v128 g = v128_load_aligned(src + AOMMIN(ymax, y + 1) * sstride); |
| 449 | const v128 h = v128_load_aligned(src + AOMMIN(ymax, y + 2) * sstride); |
| 450 | v128 c, d, e, f; |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 451 | |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 452 | if (left) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 453 | c = v128_load_unaligned(src + y * sstride - 2); |
| 454 | d = v128_load_unaligned(src + y * sstride - 1); |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 455 | } else { // Left clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 456 | c = v128_shuffle_8(o, v128_load_aligned(c_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 457 | d = v128_shuffle_8(o, v128_load_aligned(d_shuff)); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 458 | } |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 459 | if (right) { |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 460 | e = v128_load_unaligned(src + y * sstride + 1); |
| 461 | f = v128_load_unaligned(src + y * sstride + 2); |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 462 | } else { // Right clipping |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 463 | e = v128_shuffle_8(o, v128_load_aligned(e_shuff)); |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 464 | f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 465 | } |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 466 | calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd); |
Steinar Midtskogen | ecf9a0c | 2016-09-13 16:37:13 +0200 | [diff] [blame] | 467 | dst += dstride; |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 468 | } |
| 469 | } |
| 470 | |
Steinar Midtskogen | f844e6e | 2017-02-09 17:24:37 +0100 | [diff] [blame^] | 471 | // As above, but with no clipping tests |
| 472 | SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst, |
| 473 | int sstride, int dstride, int x0, int y0, |
| 474 | int sizey, unsigned int strength, |
| 475 | unsigned int bd) { |
| 476 | int y; |
| 477 | |
| 478 | dst += x0 + y0 * dstride; |
| 479 | src += x0 + y0 * sstride; |
| 480 | |
| 481 | for (y = 0; y < sizey; y++) { |
| 482 | const v128 o = v128_load_aligned(src); |
| 483 | const v128 a = v128_load_aligned(src - 2 * sstride); |
| 484 | const v128 b = v128_load_aligned(src - 1 * sstride); |
| 485 | const v128 g = v128_load_aligned(src + sstride); |
| 486 | const v128 h = v128_load_aligned(src + 2 * sstride); |
| 487 | const v128 c = v128_load_unaligned(src - 2); |
| 488 | const v128 d = v128_load_unaligned(src - 1); |
| 489 | const v128 e = v128_load_unaligned(src + 1); |
| 490 | const v128 f = v128_load_unaligned(src + 2); |
| 491 | |
| 492 | calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd); |
| 493 | src += sstride; |
| 494 | dst += dstride; |
| 495 | } |
| 496 | } |
| 497 | |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 498 | void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, |
| 499 | int sstride, int dstride, int x0, int y0, |
Steinar Midtskogen | 73ad523 | 2017-01-30 14:39:07 +0100 | [diff] [blame] | 500 | int sizex, int sizey, unsigned int strength, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 501 | BOUNDARY_TYPE bt, unsigned int bd) { |
Steinar Midtskogen | e66fc87 | 2016-09-26 12:51:25 +0200 | [diff] [blame] | 502 | if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { |
| 503 | // Fallback to C for odd sizes: |
| 504 | // * block width not 4 or 8 |
| 505 | // * block heights not a multiple of 2 if the block width is 4 |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 506 | aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, |
Steinar Midtskogen | 4f0b3ed | 2017-02-08 18:48:07 +0100 | [diff] [blame] | 507 | strength, bt, bd); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 508 | } else { |
Steinar Midtskogen | f844e6e | 2017-02-09 17:24:37 +0100 | [diff] [blame^] | 509 | if (bt) |
| 510 | (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)( |
| 511 | src, dst, sstride, dstride, x0, y0, sizey, strength, bt, bd); |
| 512 | else |
| 513 | (sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)( |
| 514 | src, dst, sstride, dstride, x0, y0, sizey, strength, bd); |
Steinar Midtskogen | 3dbd55a | 2016-09-09 15:23:35 +0200 | [diff] [blame] | 515 | } |
| 516 | } |
| 517 | #endif |