Deepa K G | f2f276c | 2018-02-27 19:01:55 +0530 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
| 3 | * |
| 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | */ |
| 11 | |
| 12 | #include <immintrin.h> |
| 13 | |
Tom Finegan | 44702c8 | 2018-05-22 13:00:39 -0700 | [diff] [blame] | 14 | #include "config/aom_dsp_rtcd.h" |
| 15 | |
Deepa K G | f2f276c | 2018-02-27 19:01:55 +0530 | [diff] [blame] | 16 | #include "aom_dsp/x86/convolve_avx2.h" |
| 17 | #include "aom_dsp/x86/convolve_common_intrin.h" |
| 18 | #include "aom_dsp/x86/convolve_sse4_1.h" |
| 19 | #include "aom_dsp/aom_dsp_common.h" |
| 20 | #include "aom_dsp/aom_filter.h" |
| 21 | #include "av1/common/convolve.h" |
| 22 | |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 23 | static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { |
| 24 | const int w0 = conv_params->fwd_offset; |
| 25 | const int w1 = conv_params->bck_offset; |
| 26 | const __m256i wt0 = _mm256_set1_epi16(w0); |
| 27 | const __m256i wt1 = _mm256_set1_epi16(w1); |
| 28 | const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); |
| 29 | return wt; |
| 30 | } |
| 31 | |
| 32 | static INLINE __m256i load_line2_avx2(const void *a, const void *b) { |
| 33 | return _mm256_permute2x128_si256( |
| 34 | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), |
| 35 | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); |
| 36 | } |
| 37 | |
// Horizontal-only 8-tap convolution for distance-weighted compound
// prediction, 8-bit path.  Processes two rows per outer iteration and 8
// output columns per inner step.  If conv_params->do_average is set, the
// intermediate values already in conv_params->dst are combined with the new
// filtered values (distance-weighted when use_dist_wtd_comp_avg is set) and
// the final 8-bit pixels are written to dst0; otherwise the offset
// intermediate results are stored to conv_params->dst for a later pass.
// filter_params_y / subpel_y_q4 are unused in this horizontal-only kernel.
void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
                                  uint8_t *dst0, int dst_stride0, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  const int bd = 8;  // bit depth of this (lowbd) path
  int i, j;
  // Back the source pointer up so the taps are centered on each pixel.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *const src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_1;
  // Interleaved (fwd, bck) weights for the distance-weighted average.
  const __m256i wt = unpack_weights_avx2(conv_params);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  // Offset that keeps the intermediate results unsigned in 16 bits; it is
  // subtracted again inside convolve_rounding on the averaging path.
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  __m256i filt[4], coeffs[4];

  assert(bits >= 0);
  assert(conv_params->round_0 > 0);

  // Byte-shuffle patterns consumed by convolve_lowbd_x (presumably pshufb
  // masks defined alongside it in convolve_avx2.h).
  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);

  // round_0 - 1 (not round_0) here: the lowbd coefficients appear to be
  // pre-divided by 2 (see the "+1 to compensate" note in the vertical
  // kernel) -- TODO(review): confirm against prepare_coeffs_lowbd.
  const __m256i round_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);

  (void)filter_params_y;
  (void)subpel_y_q4;

  for (i = 0; i < h; i += 2) {
    const uint8_t *src_data = src_ptr + i * src_stride;
    CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
    for (j = 0; j < w; j += 8) {
      // Row i in the low 128-bit lane, row i+1 in the high lane.
      const __m256i data =
          load_line2_avx2(&src_data[j], &src_data[j + src_stride]);

      __m256i res = convolve_lowbd_x(data, coeffs, filt);

      // First-stage rounding, then shift up into the compound domain.
      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);

      res = _mm256_slli_epi16(res, bits);

      const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m256i data_ref_0 =
            load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
        const __m256i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        // Remove the unsigned-domain offset and round back to pixel range.
        const __m256i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
        const __m128i res_0 = _mm256_castsi256_si128(res_8);
        const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);

        if (w > 4) {
          // 8 pixels per row: 64-bit stores.
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
          _mm_storel_epi64(
              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
        } else {
          // 4 pixels per row: 32-bit stores.
          *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
          *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
              _mm_cvtsi128_si32(res_1);
        }
      } else {
        // First pass of the compound pair: stash intermediates in dst.
        const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);

        const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
        _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                        res_1);
      }
    }
  }
}
| 129 | |
Debargha Mukherjee | 0c96c11 | 2018-12-20 16:04:18 -0800 | [diff] [blame] | 130 | void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, |
| 131 | uint8_t *dst0, int dst_stride0, int w, int h, |
| 132 | const InterpFilterParams *filter_params_x, |
| 133 | const InterpFilterParams *filter_params_y, |
| 134 | const int subpel_x_q4, const int subpel_y_q4, |
| 135 | ConvolveParams *conv_params) { |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 136 | CONV_BUF_TYPE *dst = conv_params->dst; |
| 137 | int dst_stride = conv_params->dst_stride; |
| 138 | const int bd = 8; |
| 139 | int i, j; |
| 140 | const int fo_vert = filter_params_y->taps / 2 - 1; |
| 141 | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
| 142 | // +1 to compensate for dividing the filter coeffs by 2 |
| 143 | const int left_shift = FILTER_BITS - conv_params->round_0 + 1; |
| 144 | const __m256i round_const = |
| 145 | _mm256_set1_epi32((1 << conv_params->round_1) >> 1); |
| 146 | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 147 | const __m256i wt = unpack_weights_avx2(conv_params); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 148 | const int do_average = conv_params->do_average; |
Debargha Mukherjee | 7ac3eb1 | 2018-12-12 10:26:50 -0800 | [diff] [blame] | 149 | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 150 | const int offset_0 = |
| 151 | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
| 152 | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
| 153 | const __m256i offset_const = _mm256_set1_epi16(offset); |
| 154 | const int offset_1 = (1 << (bd + FILTER_BITS - 2)); |
| 155 | const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); |
| 156 | const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); |
| 157 | const int rounding_shift = |
| 158 | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
| 159 | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
| 160 | const __m256i zero = _mm256_setzero_si256(); |
| 161 | __m256i coeffs[4], s[8]; |
| 162 | |
| 163 | assert((FILTER_BITS - conv_params->round_0) >= 0); |
| 164 | |
| 165 | prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); |
| 166 | |
| 167 | (void)conv_params; |
| 168 | (void)filter_params_x; |
| 169 | (void)subpel_x_q4; |
| 170 | |
| 171 | for (j = 0; j < w; j += 16) { |
| 172 | const uint8_t *data = &src_ptr[j]; |
| 173 | __m256i src6; |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 174 | // Load lines a and b. Line a to lower 128, line b to upper 128 |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 175 | { |
| 176 | __m256i src_ab[7]; |
| 177 | __m256i src_a[7]; |
| 178 | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
| 179 | for (int kk = 0; kk < 6; ++kk) { |
| 180 | data += src_stride; |
| 181 | src_a[kk + 1] = |
| 182 | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
| 183 | src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
| 184 | } |
| 185 | src6 = src_a[6]; |
| 186 | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
| 187 | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
| 188 | s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); |
| 189 | s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
| 190 | s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
| 191 | s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); |
| 192 | } |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 193 | |
| 194 | for (i = 0; i < h; i += 2) { |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 195 | data = &src_ptr[(i + 7) * src_stride + j]; |
| 196 | const __m256i src7 = |
| 197 | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
| 198 | const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 199 | |
| 200 | src6 = _mm256_castsi128_si256( |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 201 | _mm_loadu_si128((__m128i *)(data + src_stride))); |
| 202 | const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 203 | |
| 204 | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
| 205 | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
| 206 | |
| 207 | __m256i res_lo = convolve_lowbd(s, coeffs); |
| 208 | |
| 209 | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
| 210 | |
| 211 | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
| 212 | const __m256i res_lo_0_shift = |
| 213 | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
| 214 | const __m256i res_lo_0_round = _mm256_sra_epi32( |
| 215 | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
| 216 | |
| 217 | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
| 218 | const __m256i res_lo_1_shift = |
| 219 | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
| 220 | const __m256i res_lo_1_round = _mm256_sra_epi32( |
| 221 | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
| 222 | |
| 223 | const __m256i res_lo_round = |
| 224 | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
| 225 | |
| 226 | const __m256i res_lo_unsigned = |
| 227 | _mm256_add_epi16(res_lo_round, offset_const_2); |
| 228 | |
| 229 | if (w - j < 16) { |
| 230 | if (do_average) { |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 231 | const __m256i data_ref_0 = load_line2_avx2( |
| 232 | &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); |
Debargha Mukherjee | 7ac3eb1 | 2018-12-12 10:26:50 -0800 | [diff] [blame] | 233 | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
| 234 | &wt, use_dist_wtd_comp_avg); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 235 | |
| 236 | const __m256i round_result = convolve_rounding( |
| 237 | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
| 238 | |
| 239 | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
| 240 | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
| 241 | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
| 242 | |
| 243 | if (w - j > 4) { |
| 244 | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
| 245 | _mm_storel_epi64( |
| 246 | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
| 247 | } else { |
| 248 | *(uint32_t *)(&dst0[i * dst_stride0 + j]) = |
| 249 | _mm_cvtsi128_si32(res_0); |
| 250 | *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
| 251 | _mm_cvtsi128_si32(res_1); |
| 252 | } |
| 253 | } else { |
| 254 | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
| 255 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
| 256 | |
| 257 | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
| 258 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
| 259 | res_1); |
| 260 | } |
| 261 | } else { |
| 262 | __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
| 263 | |
| 264 | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
| 265 | |
| 266 | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
| 267 | const __m256i res_hi_0_shift = |
| 268 | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
| 269 | const __m256i res_hi_0_round = _mm256_sra_epi32( |
| 270 | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
| 271 | |
| 272 | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
| 273 | const __m256i res_hi_1_shift = |
| 274 | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
| 275 | const __m256i res_hi_1_round = _mm256_sra_epi32( |
| 276 | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
| 277 | |
| 278 | const __m256i res_hi_round = |
| 279 | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
| 280 | |
| 281 | const __m256i res_hi_unsigned = |
| 282 | _mm256_add_epi16(res_hi_round, offset_const_2); |
| 283 | |
| 284 | if (do_average) { |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 285 | const __m256i data_ref_0_lo = load_line2_avx2( |
| 286 | &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 287 | |
pengbin | 2645155 | 2018-08-08 10:15:28 +0800 | [diff] [blame] | 288 | const __m256i data_ref_0_hi = |
| 289 | load_line2_avx2(&dst[i * dst_stride + j + 8], |
| 290 | &dst[i * dst_stride + j + 8 + dst_stride]); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 291 | |
Debargha Mukherjee | 7ac3eb1 | 2018-12-12 10:26:50 -0800 | [diff] [blame] | 292 | const __m256i comp_avg_res_lo = comp_avg( |
| 293 | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 294 | |
Debargha Mukherjee | 7ac3eb1 | 2018-12-12 10:26:50 -0800 | [diff] [blame] | 295 | const __m256i comp_avg_res_hi = comp_avg( |
| 296 | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
Cherma Rajan A | a7be368 | 2018-03-20 10:00:51 +0530 | [diff] [blame] | 297 | |
| 298 | const __m256i round_result_lo = convolve_rounding( |
| 299 | &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); |
| 300 | |
| 301 | const __m256i round_result_hi = convolve_rounding( |
| 302 | &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); |
| 303 | |
| 304 | const __m256i res_8 = |
| 305 | _mm256_packus_epi16(round_result_lo, round_result_hi); |
| 306 | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
| 307 | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
| 308 | |
| 309 | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
| 310 | _mm_store_si128( |
| 311 | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
| 312 | |
| 313 | } else { |
| 314 | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
| 315 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
| 316 | |
| 317 | const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
| 318 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
| 319 | res_lo_1); |
| 320 | |
| 321 | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
| 322 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); |
| 323 | |
| 324 | const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); |
| 325 | _mm_store_si128( |
| 326 | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); |
| 327 | } |
| 328 | } |
| 329 | s[0] = s[1]; |
| 330 | s[1] = s[2]; |
| 331 | s[2] = s[3]; |
| 332 | |
| 333 | s[4] = s[5]; |
| 334 | s[5] = s[6]; |
| 335 | s[6] = s[7]; |
| 336 | } |
| 337 | } |
| 338 | } |
| 339 | |
// Separable 2D 8-tap convolution for distance-weighted compound prediction,
// 8-bit path.  For each 8-column strip: a horizontal pass filters the
// padded source region into the 16-bit scratch buffer im_block, then a
// vertical pass filters im_block two output rows at a time, keeping a
// sliding window of row pairs in s[0..7].  If conv_params->do_average is
// set, the values in conv_params->dst are blended with the new results and
// final pixels are written to dst0; otherwise the offset intermediates are
// stored to conv_params->dst.
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  const int bd = 8;  // bit depth of this (lowbd) path

  // 16-bit scratch for the horizontal-pass output; 8 columns per strip.
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  int im_h = h + filter_params_y->taps - 1;  // extra rows for vertical taps
  int im_stride = 8;
  int i, j;
  // Offsets so the taps are centered on the output pixel in both directions.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
  // Interleaved (fwd, bck) weights for the distance-weighted average.
  const __m256i wt = unpack_weights_avx2(conv_params);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  // Offset keeping intermediates unsigned in 16 bits; removed again by
  // convolve_rounding on the averaging path.
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];

  assert(conv_params->round_0 > 0);

  // Byte-shuffle patterns consumed by convolve_lowbd_x (presumably pshufb
  // masks defined alongside it in convolve_avx2.h).
  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);

  // Horizontal rounding: round_0 - 1 shift plus a bd-dependent bias that
  // keeps the intermediate values non-negative for the vertical pass.
  const __m256i round_const_h = _mm256_set1_epi16(
      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);

  // Vertical rounding constant; the subtracted term cancels the horizontal
  // bias introduced above.
  const __m256i round_const_v = _mm256_set1_epi32(
      ((1 << conv_params->round_1) >> 1) -
      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    {
      const uint8_t *src_h = src_ptr + j;
      // Two source rows per iteration: row i in the low lane, row i+1 (when
      // it exists) in the high lane.
      for (i = 0; i < im_h; i += 2) {
        __m256i data =
            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
        if (i + 1 < im_h)
          data = _mm256_inserti128_si256(
              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
        src_h += (src_stride << 1);
        __m256i res = convolve_lowbd_x(data, coeffs_x, filt);

        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
                               round_shift_h);

        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      }
    }

    /* Vertical filter */
    {
      // Prime the sliding window with the first 6 intermediate rows,
      // interleaving consecutive row pairs for the multiply-add kernel.
      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

      s[0] = _mm256_unpacklo_epi16(s0, s1);
      s[1] = _mm256_unpacklo_epi16(s2, s3);
      s[2] = _mm256_unpacklo_epi16(s4, s5);

      s[4] = _mm256_unpackhi_epi16(s0, s1);
      s[5] = _mm256_unpackhi_epi16(s2, s3);
      s[6] = _mm256_unpackhi_epi16(s4, s5);

      for (i = 0; i < h; i += 2) {
        const int16_t *data = &im_block[i * im_stride];

        // Rows i+6 and i+7 complete the 8-row tap window.
        const __m256i s6 =
            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
        const __m256i s7 =
            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

        s[3] = _mm256_unpacklo_epi16(s6, s7);
        s[7] = _mm256_unpackhi_epi16(s6, s7);

        const __m256i res_a = convolve(s, coeffs_y);
        const __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_v), round_shift_v);

        if (w - j > 4) {
          // 8-wide output: filter the high half as well.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          const __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);

          if (do_average) {
            const __m256i data_ref_0 =
                load_line2_avx2(&dst[i * dst_stride + j],
                                &dst[i * dst_stride + j + dst_stride]);
            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
                                                  &wt, use_dist_wtd_comp_avg);

            const __m256i round_result = convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_8 =
                _mm256_packus_epi16(round_result, round_result);
            const __m128i res_0 = _mm256_castsi256_si128(res_8);
            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
          } else {
            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);

            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        } else {
          // 4-wide output: only the low half is live.
          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);

          if (do_average) {
            const __m256i data_ref_0 =
                load_line2_avx2(&dst[i * dst_stride + j],
                                &dst[i * dst_stride + j + dst_stride]);

            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
                                                  &wt, use_dist_wtd_comp_avg);

            const __m256i round_result = convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_8 =
                _mm256_packus_epi16(round_result, round_result);
            const __m128i res_0 = _mm256_castsi256_si128(res_8);
            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);

            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
                _mm_cvtsi128_si32(res_0);
            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
                _mm_cvtsi128_si32(res_1);

          } else {
            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);

            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }

        // Slide the window down by two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
| 519 | |
// Degenerate (copy) case of the distance-weighted compound convolution:
// no filtering, the source pixels are just shifted into the compound
// intermediate domain.  If conv_params->do_average is set, the values in
// conv_params->dst are blended with the shifted source and final pixels go
// to dst0; otherwise the offset intermediates are stored to
// conv_params->dst.  Filter parameters and subpel phases are unused.
// Two vector paths: multiples of 16 columns (one row at a time) and
// multiples of 4 columns (two rows at a time).
void av1_dist_wtd_convolve_2d_copy_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params) {
  const int bd = 8;  // bit depth of this (lowbd) path
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  // Shift that places plain pixels in the same domain as filtered
  // intermediates (both convolution rounds skipped).
  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  // Interleaved (fwd, bck) weights for the distance-weighted average.
  const __m256i wt = unpack_weights_avx2(conv_params);
  const __m256i zero = _mm256_setzero_si256();

  // Offset keeping intermediates unsigned in 16 bits; removed again by
  // convolve_rounding on the averaging path.
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
  int i, j;

  if (!(w % 16)) {
    // Wide path: 16 pixels per step, one row per iteration.
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 16) {
        const __m256i src_16bit = _mm256_cvtepu8_epi16(
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));

        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);

        if (do_average) {
          const __m256i data_ref_0 =
              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));

          const __m256i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m256i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
          // Gather the packed bytes from both 128-bit lanes into the low
          // half before the 16-byte store.
          const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
                          _mm256_castsi256_si128(res_0));
        } else {
          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
                             res_unsigned);
        }
      }
    }
  } else if (!(w % 4)) {
    // Narrow path: 8 pixels per step, two rows per iteration.
    // NOTE(review): for w == 4 the 64-bit loads/stores touch 8 columns per
    // row -- presumably relies on padded source/intermediate buffers;
    // confirm against the callers' buffer allocation.
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_row_0 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
        // since not all compilers yet support _mm256_set_m128i()
        const __m256i src_10 = _mm256_insertf128_si256(
            _mm256_castsi128_si256(src_row_0), src_row_1, 1);

        const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);

        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);

        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          const __m256i data_ref_0 = load_line2_avx2(
              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
          const __m256i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m256i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
          const __m128i res_0 = _mm256_castsi256_si128(res_8);
          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);

          if (w > 4) {
            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
          } else {
            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
                _mm_cvtsi128_si32(res_0);
            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
                _mm_cvtsi128_si32(res_1);
          }
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);

          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                          res_1);
        }
      }
    }
  }
}