/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/restoration.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"

// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
// 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_8_32(const void *p) {
  return _mm256_cvtepu8_epi32(xx_loadl_64(p));
}

// Load 8 halfwords from the possibly-misaligned pointer p, extend each
// halfword to 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_16_32(const void *p) {
  return _mm256_cvtepu16_epi32(xx_loadu_128(p));
}

// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
// x0+x1+...+x7
//
// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
// (assumed small enough to be able to add them without overflow).
//
// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
//
// x   = [h g f e][d c b a]
// x01 = [g f e 0][c b a 0]
// x02 = [g+h f+g e+f e][c+d b+c a+b a]
// x03 = [e+f e 0 0][a+b a 0 0]
// x04 = [e->h e->g e->f e][a->d a->c a->b a]
// s   = a->d
// s01 = [a->d a->d a->d a->d]
// s02 = [a->d a->d a->d a->d][0 0 0 0]
// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
static __m256i scan_32(__m256i x) {
  const __m256i x01 = _mm256_slli_si256(x, 4);
  const __m256i x02 = _mm256_add_epi32(x, x01);
  const __m256i x03 = _mm256_slli_si256(x02, 8);
  const __m256i x04 = _mm256_add_epi32(x02, x03);
  const int32_t s = _mm256_extract_epi32(x04, 3);
  const __m128i s01 = _mm_set1_epi32(s);
  const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
  return _mm256_add_epi32(x04, s02);
}
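
// For reference, a scalar model of scan_32 (an illustrative sketch only; the
// helper name below is hypothetical and the function is not part of this
// file):
//
//   static void scan_32_c(const int32_t x[8], int32_t out[8]) {
//     int32_t acc = 0;
//     for (int k = 0; k < 8; ++k) out[k] = acc += x[k];
//   }
//
// The SIMD version computes the same inclusive prefix sum within each 128-bit
// lane and then adds the low lane's total to every element of the high lane.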

// Set count int32_t values starting at dest to zero, assuming *zero is an
// all-zeros vector supplied by the caller. Like memset, returns dest.
static void *memset_zero_avx(int32_t *dest, const __m256i *zero,
                             size_t count) {
  unsigned int i = 0;
  for (i = 0; i < (count & 0xffffffe0); i += 32) {
    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
    _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
    _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
    _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
  }
  for (; i < (count & 0xfffffff8); i += 8) {
    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
  }
  for (; i < count; i++) {
    dest[i] = 0;
  }
  return dest;
}

// Compute two integral images from src. B sums elements; A sums their
// squares. The images are offset by one pixel, so they have size
// (width + 1) x (height + 1) and their first row and column are zero.
//
// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.
static void integral_images(const uint8_t *src, int src_stride, int width,
                            int height, int32_t *A, int32_t *B,
                            int buf_stride) {
  const __m256i zero = _mm256_setzero_si256();
  // Write out the zero top row
  memset_zero_avx(A, &zero, (width + 8));
  memset_zero_avx(B, &zero, (width + 8));
  for (int i = 0; i < height; ++i) {
    // Zero the left column.
    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;

    // ldiff is the difference H - D where H is the output sample immediately
    // to the left and D is the output sample above it. These are scalars,
    // replicated across the eight lanes.
    __m256i ldiff1 = zero, ldiff2 = zero;
    for (int j = 0; j < width; j += 8) {
      const int ABj = 1 + j;

      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);

      const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
      const __m256i x2 = _mm256_madd_epi16(x1, x1);

      const __m256i sc1 = scan_32(x1);
      const __m256i sc2 = scan_32(x2);

      const __m256i row1 =
          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
      const __m256i row2 =
          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);

      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);

      // Calculate the new H - D.
      ldiff1 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
      ldiff2 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
    }
  }
}

// Compute two integral images from src. B sums elements; A sums their
// squares.
//
// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.
static void integral_images_highbd(const uint16_t *src, int src_stride,
                                   int width, int height, int32_t *A,
                                   int32_t *B, int buf_stride) {
  const __m256i zero = _mm256_setzero_si256();
  // Write out the zero top row
  memset_zero_avx(A, &zero, (width + 8));
  memset_zero_avx(B, &zero, (width + 8));

  for (int i = 0; i < height; ++i) {
    // Zero the left column.
    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;

    // ldiff is the difference H - D where H is the output sample immediately
    // to the left and D is the output sample above it. These are scalars,
    // replicated across the eight lanes.
    __m256i ldiff1 = zero, ldiff2 = zero;
    for (int j = 0; j < width; j += 8) {
      const int ABj = 1 + j;

      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);

      const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
      const __m256i x2 = _mm256_madd_epi16(x1, x1);

      const __m256i sc1 = scan_32(x1);
      const __m256i sc2 = scan_32(x2);

      const __m256i row1 =
          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
      const __m256i row2 =
          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);

      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);

      // Calculate the new H - D.
      ldiff1 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
      ldiff2 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
    }
  }
}
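
// For reference, both integral-image builders above implement the scalar
// recurrence
//
//   B[i + 1][x] = B[i][x] + (sum of src[i][x'] for x' < x)
//
// (and likewise for A with squared samples). The row sum is built from an
// in-register prefix sum over the current eight columns (scan_32) plus ldiff,
// which carries the total accumulated in the blocks to the left. Squaring
// uses _mm256_madd_epi16: each 32-bit lane of x1 holds a sample in its low
// 16 bits with the high 16 bits zero, so the multiply-add gives the exact
// square for any sample that fits in 15 bits.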

// Compute 8 values of boxsum from the given integral image. ii should point
// at the middle of the box (for the first value). r is the box radius.
static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
  const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
  const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
  const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
  const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
  const __m256i u = _mm256_sub_epi32(tr, tl);
  const __m256i v = _mm256_sub_epi32(br, bl);
  return _mm256_sub_epi32(v, u);
}
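
// Expanding the two subtractions above gives the usual four-corner identity
// for an integral image: boxsum = br - bl - tr + tl, i.e. the sum of the
// (2r + 1) x (2r + 1) source samples centred on the position ii points at.
// The asymmetric corner offsets (-(r + 1) versus +r) absorb the one-pixel
// offset with which the integral images were built.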

static __m256i round_for_shift(unsigned shift) {
  return _mm256_set1_epi32((1 << shift) >> 1);
}

static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
  __m256i an, bb;
  if (bit_depth > 8) {
    const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
    const __m256i rounding_b = round_for_shift(bit_depth - 8);
    const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
    const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
    const __m256i a =
        _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
    const __m256i b =
        _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
    // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
    // mullo to square it
    bb = _mm256_madd_epi16(b, b);
    an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
  } else {
    bb = _mm256_madd_epi16(sum1, sum1);
    an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
  }
  return _mm256_sub_epi32(an, bb);
}
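
// In scalar terms (exactly so at 8 bits, and up to the downshifting above for
// deeper bit depths) compute_p returns n * (sum of squares) - (sum)^2 over
// the n pixels of the box, i.e. n^2 times the sample variance. The madd
// squaring of the sum is exact because at 8 bits sum1 <= 255 * n < 2^15,
// while the high-bit-depth branch first shifts its sums down so that
// b < 2^14 (see the comment above).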

// Assumes that C, D are integral images for the original buffer which has been
// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
// on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                    int width, int height, int buf_stride, int bit_depth,
                    int sgr_params_idx, int radius_idx) {
  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int n = (2 * r + 1) * (2 * r + 1);
  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  // one_by_x[n - 1] is 2^12/n, so easily fits in an int16
  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);

  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);

  // Set up masks
  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  __m256i mask[8];
  for (int idx = 0; idx < 8; idx++) {
    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  }

  for (int i = -1; i < height + 1; ++i) {
    for (int j = -1; j < width + 1; j += 8) {
      const int32_t *Cij = C + i * buf_stride + j;
      const int32_t *Dij = D + i * buf_stride + j;

      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);

      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
      // some uninitialised data in their upper words. We use a mask to
      // ensure that these bits are set to 0.
      int idx = AOMMIN(8, width + 1 - j);
      assert(idx >= 1);

      if (idx < 8) {
        sum1 = _mm256_and_si256(mask[idx], sum1);
        sum2 = _mm256_and_si256(mask[idx], sum2);
      }

      const __m256i p = compute_p(sum1, sum2, bit_depth, n);

      const __m256i z = _mm256_min_epi32(
          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
                            SGRPROJ_MTABLE_BITS),
          _mm256_set1_epi32(255));

      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);

      yy_storeu_256(A + i * buf_stride + j, a_res);

      const __m256i a_complement =
          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);

      // sum1 might have lanes greater than 2^15, so we can't use madd to do
      // multiplication involving sum1. However, a_complement and one_over_n
      // are both small (at most 2^8 and 2^12 respectively), so we can multiply
      // them first with a 16-bit madd.
      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
                                              SGRPROJ_RECIP_BITS);

      yy_storeu_256(B + i * buf_stride + j, b_res);
    }
  }
}
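
// Interpreted as the self-guided filter coefficients: a_res approximates
// SGRPROJ_SGR * z / (z + 1) via the x_by_xplus1 table, where z grows with the
// windowed variance p, and b_res is roughly (SGRPROJ_SGR - a_res) times the
// mean of the window. Flat windows therefore favour the local mean and
// textured windows favour the original pixel in the final filter.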

// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
// where the outer four corners have weight 3 and all other pixels have weight
// 4.
//
// Pixels are indexed as follows:
// xtl xt xtr
// xl  x  xr
// xbl xb xbr
//
// buf points to x
//
// fours = xl + xt + xr + xb + x
// threes = xtl + xtr + xbr + xbl
// cross_sum = 4 * fours + 3 * threes
//           = 4 * (fours + threes) - threes
//           = (fours + threes) << 2 - threes
static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  const __m256i xt = yy_loadu_256(buf - stride);
  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  const __m256i xl = yy_loadu_256(buf - 1);
  const __m256i x = yy_loadu_256(buf);
  const __m256i xr = yy_loadu_256(buf + 1);
  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  const __m256i xb = yy_loadu_256(buf + stride);
  const __m256i xbr = yy_loadu_256(buf + 1 + stride);

  const __m256i fours = _mm256_add_epi32(
      xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
  const __m256i threes =
      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));

  return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
                          threes);
}
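
// The weights above sum to 5 * 4 + 4 * 3 = 32 = 2^5, which matches nb = 5
// used in final_filter below. It also bounds cross_sum(A) by
// 32 * SGRPROJ_SGR = 2^13, so the 16-bit madd in final_filter forms the
// product of the A cross-sum and the source pixel exactly.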

// The final filter for self-guided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum implementation above).
static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                         const int32_t *B, int buf_stride, const void *dgd8,
                         int dgd_stride, int width, int height, int highbd) {
  const int nb = 5;
  const __m256i rounding =
      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  const uint8_t *dgd_real =
      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;

  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; j += 8) {
      const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
      const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);

      const __m128i raw =
          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
      const __m256i src =
          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

      __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
      __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
                                    SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);

      yy_storeu_256(dst + i * dst_stride + j, w);
    }
  }
}

// Assumes that C, D are integral images for the original buffer which has been
// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
// on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
                         const int32_t *D, int width, int height,
                         int buf_stride, int bit_depth, int sgr_params_idx,
                         int radius_idx) {
  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int n = (2 * r + 1) * (2 * r + 1);
  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  // one_by_x[n - 1] is 2^12/n, so easily fits in an int16
  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);

  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);

  // Set up masks
  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  __m256i mask[8];
  for (int idx = 0; idx < 8; idx++) {
    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  }

  for (int i = -1; i < height + 1; i += 2) {
    for (int j = -1; j < width + 1; j += 8) {
      const int32_t *Cij = C + i * buf_stride + j;
      const int32_t *Dij = D + i * buf_stride + j;

      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);

      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
      // some uninitialised data in their upper words. We use a mask to
      // ensure that these bits are set to 0.
      int idx = AOMMIN(8, width + 1 - j);
      assert(idx >= 1);

      if (idx < 8) {
        sum1 = _mm256_and_si256(mask[idx], sum1);
        sum2 = _mm256_and_si256(mask[idx], sum2);
      }

      const __m256i p = compute_p(sum1, sum2, bit_depth, n);

      const __m256i z = _mm256_min_epi32(
          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
                            SGRPROJ_MTABLE_BITS),
          _mm256_set1_epi32(255));

      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);

      yy_storeu_256(A + i * buf_stride + j, a_res);

      const __m256i a_complement =
          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);

      // sum1 might have lanes greater than 2^15, so we can't use madd to do
      // multiplication involving sum1. However, a_complement and one_over_n
      // are both less than 256, so we can multiply them first.
      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
                                              SGRPROJ_RECIP_BITS);

      yy_storeu_256(B + i * buf_stride + j, b_res);
    }
  }
}

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
//  -  buf -
// xbl xb xbr
//
// Pixels are weighted like this:
//  5   6  5
//  0   0  0
//  5   6  5
//
// fives = xtl + xtr + xbl + xbr
// sixes = xt + xb
// cross_sum = 6 * sixes + 5 * fives
//           = 5 * (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  const __m256i xt = yy_loadu_256(buf - stride);
  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  const __m256i xb = yy_loadu_256(buf + stride);
  const __m256i xbr = yy_loadu_256(buf + 1 + stride);

  const __m256i fives =
      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
  const __m256i sixes = _mm256_add_epi32(xt, xb);
  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);

  return _mm256_add_epi32(
      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
                       fives_plus_sixes),
      sixes);
}
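
// Here the weights sum to 4 * 5 + 2 * 6 = 32 = 2^5, matching nb0 = 5 used for
// even rows in final_filter_fast below.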

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xl  x  xr
//
// Pixels are weighted like this:
//  5  6  5
//
// buf points to x
//
// fives = xl + xr
// sixes = x
// cross_sum = 5 * fives + 6 * sixes
//           = 4 * (fives + sixes) + (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
  const __m256i xl = yy_loadu_256(buf - 1);
  const __m256i x = yy_loadu_256(buf);
  const __m256i xr = yy_loadu_256(buf + 1);

  const __m256i fives = _mm256_add_epi32(xl, xr);
  const __m256i sixes = x;

  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);

  return _mm256_add_epi32(
      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
                       fives_plus_sixes),
      sixes);
}
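
// For odd rows the weights sum to 2 * 5 + 6 = 16 = 2^4, matching nb1 = 4 in
// final_filter_fast below.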

// The final filter for the self-guided restoration. Computes a weighted
// average across A, B with "cross sums" (see cross_sum_... implementations
// above). calc_ab_fast only fills A and B on every other row, so even output
// rows combine the rows above and below them (cross_sum_fast_even_row) while
// odd output rows use their own row (cross_sum_fast_odd_row).
static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                              const int32_t *B, int buf_stride,
                              const void *dgd8, int dgd_stride, int width,
                              int height, int highbd) {
  const int nb0 = 5;
  const int nb1 = 4;

  const __m256i rounding0 =
      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  const __m256i rounding1 =
      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);

  const uint8_t *dgd_real =
      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;

  for (int i = 0; i < height; ++i) {
    if (!(i & 1)) {  // even row
      for (int j = 0; j < width; j += 8) {
        const __m256i a =
            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
        const __m256i b =
            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);

        const __m128i raw =
            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
        const __m256i src =
            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
        __m256i w =
            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);

        yy_storeu_256(dst + i * dst_stride + j, w);
      }
    } else {  // odd row
      for (int j = 0; j < width; j += 8) {
        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);

        const __m128i raw =
            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
        const __m256i src =
            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
        __m256i w =
            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);

        yy_storeu_256(dst + i * dst_stride + j, w);
      }
    }
  }
}

void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
                                     int dgd_stride, int32_t *flt0,
                                     int32_t *flt1, int flt_stride,
                                     int sgr_params_idx, int bit_depth,
                                     int highbd) {
  // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
  // Ctl and Dtl is 32-byte aligned.
  const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);

  DECLARE_ALIGNED(32, int32_t,
                  buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);

  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;

  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 32 bytes for efficiency.
  int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);

  // The "tl" pointers point at the top-left of the initialised data for the
  // array.
  int32_t *Atl = buf + 0 * buf_elts + 7;
  int32_t *Btl = buf + 1 * buf_elts + 7;
  int32_t *Ctl = buf + 2 * buf_elts + 7;
  int32_t *Dtl = buf + 3 * buf_elts + 7;

  // The "0" pointers are (-SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
  // there's a zero row and column in A, B (integral images), so we move down
  // and right one for them.
  const int buf_diag_border =
      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;

  int32_t *A0 = Atl + 1 + buf_stride;
  int32_t *B0 = Btl + 1 + buf_stride;
  int32_t *C0 = Ctl + 1 + buf_stride;
  int32_t *D0 = Dtl + 1 + buf_stride;

  // Finally, A, B, C, D point at position (0, 0).
  int32_t *A = A0 + buf_diag_border;
  int32_t *B = B0 + buf_diag_border;
  int32_t *C = C0 + buf_diag_border;
  int32_t *D = D0 + buf_diag_border;

  const int dgd_diag_border =
      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  const uint8_t *dgd0 = dgd8 - dgd_diag_border;

  // Generate integral images from the input. C will contain sums of squares; D
  // will contain just sums
  if (highbd)
    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
                           height_ext, Ctl, Dtl, buf_stride);
  else
    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                    buf_stride);

  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  // Write to flt0 and flt1
  // If params->r[k] == 0 we skip the corresponding filter. We only allow one
  // of the radii to be 0, as having both equal to 0 would be equivalent to
  // skipping SGR entirely.
  assert(!(params->r[0] == 0 && params->r[1] == 0));
  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));

  if (params->r[0] > 0) {
    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
                 sgr_params_idx, 0);
    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
                      width, height, highbd);
  }

  if (params->r[1] > 0) {
    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
            1);
    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                 height, highbd);
  }
}

void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
                                       int height, int stride, int eps,
                                       const int *xqd, uint8_t *dst8,
                                       int dst_stride, int32_t *tmpbuf,
                                       int bit_depth, int highbd) {
  int32_t *flt0 = tmpbuf;
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  assert(width * height <= RESTORATION_UNITPELS_MAX);
  av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
                                  width, eps, bit_depth, highbd);
  const sgr_params_type *const params = &sgr_params[eps];
  int xq[2];
  decode_xq(xqd, xq, params);

  __m256i xq0 = _mm256_set1_epi32(xq[0]);
  __m256i xq1 = _mm256_set1_epi32(xq[1]);

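  // Per pixel, the loop below computes
  //   u = src << SGRPROJ_RST_BITS
  //   v = (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
  //   w = (v + rounding) >> (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)
  // where an xq[k] * (flt - u) term is skipped when the corresponding radius
  // is zero, and w is then clamped and packed back to the output bit depth.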
  for (int i = 0; i < height; ++i) {
    // Calculate output in batches of 16 pixels
    for (int j = 0; j < width; j += 16) {
      const int k = i * width + j;
      const int m = i * dst_stride + j;

      const uint8_t *dat8ij = dat8 + i * stride + j;
      __m256i ep_0, ep_1;
      __m128i src_0, src_1;
      if (highbd) {
        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
        ep_0 = _mm256_cvtepu16_epi32(src_0);
        ep_1 = _mm256_cvtepu16_epi32(src_1);
      } else {
        src_0 = xx_loadu_128(dat8ij);
        ep_0 = _mm256_cvtepu8_epi32(src_0);
        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
      }

      const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
      const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);

      __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
      __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);

      if (params->r[0] > 0) {
        const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));

        const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
      }

      if (params->r[1] > 0) {
        const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));

        const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
      }

      const __m256i rounding =
          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m256i w_0 = _mm256_srai_epi32(
          _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m256i w_1 = _mm256_srai_epi32(
          _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);

      if (highbd) {
        // Pack into 16 bits and clamp to [0, 2^bit_depth)
        // Note that packing into 16 bits messes up the order of the bits,
        // so we use a permute function to correct this
        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
        const __m256i res = _mm256_min_epi16(tmp2, max);
        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
      } else {
        // Pack into 8 bits and clamp to [0, 256)
        // Note that each pack messes up the order of the bits,
        // so we use a permute function to correct this
        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
        const __m256i res =
            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
        const __m128i res2 =
            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
        xx_storeu_128(dst8 + m, res2);
      }
    }
  }
}
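
// Example call (an illustrative sketch only; the buffer names here are
// hypothetical, and tmpbuf must provide at least 2 * RESTORATION_UNITPELS_MAX
// int32_t values, per the assert above):
//
//   apply_selfguided_restoration_avx2(dat8, width, height, stride, eps, xqd,
//                                     dst8, dst_stride, tmpbuf,
//                                     /*bit_depth=*/8, /*highbd=*/0);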