/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include <smmintrin.h>

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_dsp/x86/sum_squares_sse2.h"
#include "config/aom_dsp_rtcd.h"
static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
                                                int width, int height) {
  uint64_t result;
  __m256i v_acc_q = _mm256_setzero_si256();
  // Mask that keeps the low 32 bits of each 64-bit lane.
  const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
  // Walk the block in 4-row strips, 16 columns (one 256-bit load per row)
  // at a time. Note: despite the names, `col` steps over rows of the block
  // and `row` is the column offset.
  for (int col = 0; col < height; col += 4) {
    __m256i v_acc_d = _mm256_setzero_si256();
    for (int row = 0; row < width; row += 16) {
      const int16_t *tempsrc = src + row;
      const __m256i v_val_0_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
      const __m256i v_val_1_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
      const __m256i v_val_2_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
      const __m256i v_val_3_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));

      // madd(x, x) squares each 16-bit element and adds adjacent pairs,
      // yielding 32-bit partial sums of squares.
      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);

      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);

      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
    }
    // Widen the 32-bit strip accumulator to 64 bits before it can
    // overflow: add the zero-extended even 32-bit lanes and the
    // shifted-down odd 32-bit lanes separately.
    v_acc_q =
        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
    src += 4 * stride;
  }
  // Reduce the four 64-bit lanes to a single scalar.
  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);

  result_64_2_int = _mm_add_epi64(
      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));

  xx_storel_64(&result, result_64_2_int);

  return result;
}
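
// For reference, the kernel above computes the sum over the width x height
// block of src[r * stride + c]^2, widened to 64 bits. A minimal scalar
// sketch (illustrative only; the library's portable version is
// aom_sum_squares_2d_i16_c, dispatched below, and this helper name is not
// part of the library):
//
//   static uint64_t sum_squares_2d_i16_sketch(const int16_t *src, int stride,
//                                             int width, int height) {
//     uint64_t ss = 0;
//     for (int r = 0; r < height; ++r)
//       for (int c = 0; c < width; ++c)
//         ss += (uint64_t)(src[r * stride + c] * src[r * stride + c]);
//     return ss;
//   }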

uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
                                     int height) {
  if (LIKELY(width == 4 && height == 4)) {
    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
    return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
  } else {
    return aom_sum_squares_2d_i16_c(src, stride, width, height);
  }
}
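
// Usage sketch (illustrative): callers normally reach this entry point
// through the aom_sum_squares_2d_i16 RTCD dispatch (config/aom_dsp_rtcd.h)
// rather than by naming the _avx2 symbol directly. The `diff` buffer below
// is a hypothetical example input; a 32x32 block takes the nxn_avx2 path:
//
//   int16_t diff[32 * 32];  // e.g. a prediction residual block
//   // ... fill diff ...
//   const uint64_t ss = aom_sum_squares_2d_i16_avx2(diff, 32, 32, 32);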

static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride,
                                            int width, int height, int *sum) {
  uint64_t result;
  const __m256i zero_reg = _mm256_setzero_si256();
  const __m256i one_reg = _mm256_set1_epi16(1);

  __m256i v_sse_total = zero_reg;
  __m256i v_sum_total = zero_reg;

  // As above, walk the block in 4-row strips of 16 columns each.
  for (int col = 0; col < height; col += 4) {
    __m256i v_sse_row = zero_reg;
    for (int row = 0; row < width; row += 16) {
      const int16_t *tempsrc = src + row;
      const __m256i v_val_0_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
      const __m256i v_val_1_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
      const __m256i v_val_2_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
      const __m256i v_val_3_w =
          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));

      // Plain sum: add the four rows as 16-bit lanes, then use madd(x, 1)
      // to widen adjacent pairs of 16-bit sums into 32-bit lanes.
      const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w);
      const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w);
      __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23);
      v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg);
      v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123);

      // Sum of squares: madd(x, x) squares and pairwise-adds to 32 bits.
      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
      const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d);
      v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d);
    }
    // Widen the per-strip 32-bit SSE accumulator to 64 bits by
    // interleaving with zero, then fold it into the running total.
    const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg);
    const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg);
    v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi);
    v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row);
    src += 4 * stride;
  }

  // Horizontal reduction of the 32-bit sum lanes.
  const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total);
  const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1);
  __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low);
  sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8));
  sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4));
  *sum += _mm_cvtsi128_si32(sum_128bit);

  // Horizontal reduction of the 64-bit SSE lanes.
  __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total);
  __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1);
  __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi);

  sse_128bit =
      _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit));

  xx_storel_64(&result, sse_128bit);

  return result;
}
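
// A minimal scalar sketch of the combined kernel above (illustrative only;
// the library's portable version is aom_sum_sse_2d_i16_c, dispatched
// below). It returns the sum of squares and accumulates the plain sum
// into *sum:
//
//   static uint64_t sum_sse_2d_i16_sketch(const int16_t *src, int stride,
//                                         int width, int height, int *sum) {
//     uint64_t sse = 0;
//     int s = 0;
//     for (int r = 0; r < height; ++r)
//       for (int c = 0; c < width; ++c) {
//         const int v = src[r * stride + c];
//         s += v;
//         sse += (uint64_t)(v * v);
//       }
//     *sum += s;
//     return sse;
//   }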

uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width,
                                 int height, int *sum) {
  if (LIKELY(width == 4 && height == 4)) {
    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
    return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum);
  } else {
    return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
  }
}

// Accumulate sum of 16-bit elements in the vector
static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
  __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
  __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
  vtmp2 = _mm_srli_si128(vtmp1, 8);
  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
  vtmp2 = _mm_srli_si128(vtmp1, 4);
  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
  vtmp2 = _mm_srli_si128(vtmp1, 2);
  vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
  return _mm_extract_epi16(vtmp1, 0);
}
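
// The reduction above halves the element count each step
// (16 -> 8 -> 4 -> 2 -> 1 words); e.g. with every lane holding 3, lane 0
// passes through 6, 12, 24 and finally 48 = 16 * 3. Note that
// _mm_extract_epi16 zero-extends the low word, so the result equals the
// true sum only while that sum stays below 65536; the u8 caller below
// satisfies this (it flushes at most 8 rows of 32 bytes per accumulation,
// i.e. 256 * 255 = 65280).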

// Accumulate sum of 32-bit elements in the vector
static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) {
  __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
  __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
  vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
  vtmp2 = _mm_srli_si128(vtmp1, 8);
  vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
  vtmp2 = _mm_srli_si128(vtmp1, 4);
  vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
  return _mm_cvtsi128_si32(vtmp1);
}

uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width,
                            int height) {
  uint8_t *srcp;
  uint64_t s = 0, ss = 0;
  __m256i vzero = _mm256_setzero_si256();
  __m256i v_acc_sum = vzero;
  __m256i v_acc_sqs = vzero;
  int i, j;

  // Process 32 columns at a time
  for (i = 0; i < width - 31; i += 32) {
    srcp = src + i;
    // Process 8 rows at a time
    for (j = 0; j < height - 7; j += 8) {
      __m256i vsrc[8];
      for (int k = 0; k < 8; k++) {
        vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
        srcp += src_stride;
      }
      for (int k = 0; k < 8; k++) {
        // Zero-extend the bytes to 16 bits (the unpack interleaves within
        // each 128-bit lane, but lane order does not matter for a sum).
        __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero);
        __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero);
        v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
        v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);

        __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
        __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
      }

      // Update total sum and clear the vectors
      s += mm256_accumulate_epi16(v_acc_sum);
      ss += mm256_accumulate_epi32(v_acc_sqs);
      v_acc_sum = vzero;
      v_acc_sqs = vzero;
    }

    // Process remaining rows (height not a multiple of 8)
    for (; j < height; j++) {
      __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
      __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero);
      __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero);
      v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
      v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);

      __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
      __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);

      srcp += src_stride;
    }

    // Update total sum and clear the vectors
    s += mm256_accumulate_epi16(v_acc_sum);
    ss += mm256_accumulate_epi32(v_acc_sqs);
    v_acc_sum = vzero;
    v_acc_sqs = vzero;
  }

  // Process remaining columns (width not a multiple of 32) in scalar code
  srcp = src;
  for (int k = 0; k < height; k++) {
    for (int m = i; m < width; m++) {
      uint8_t val = srcp[m];
      s += val;
      ss += val * val;
    }
    srcp += src_stride;
  }
  return (ss - s * s / (width * height));
}
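
// Despite the "var" in their names, aom_var_2d_u8_avx2 above and
// aom_var_2d_u16_avx2 below return
//
//   ss - s * s / N,  where N = width * height,
//
// i.e. the sum of squared deviations from the mean (up to the integer
// truncation of s * s / N), not the variance itself; a caller that wants
// the variance still has to divide by N.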

uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width,
                             int height) {
  uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
  uint64_t s = 0, ss = 0;
  __m256i vzero = _mm256_setzero_si256();
  __m256i v_acc_sum = vzero;
  __m256i v_acc_sqs = vzero;
  int i, j;

  // Process 16 columns at a time
  for (i = 0; i < width - 15; i += 16) {
    srcp = srcp1 + i;
    // Process 8 rows at a time
    for (j = 0; j < height - 7; j += 8) {
      __m256i vsrc[8];
      for (int k = 0; k < 8; k++) {
        vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
        srcp += src_stride;
      }
      for (int k = 0; k < 8; k++) {
        // Zero-extend the 16-bit samples to 32 bits for the sum.
        __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero);
        __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero);
        v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
        v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);

        __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]);
        v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
      }

      // Update total sum and clear the vectors
      s += mm256_accumulate_epi32(v_acc_sum);
      ss += mm256_accumulate_epi32(v_acc_sqs);
      v_acc_sum = vzero;
      v_acc_sqs = vzero;
    }

    // Process remaining rows (height not a multiple of 8)
    for (; j < height; j++) {
      __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
      __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero);
      __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero);
      v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
      v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);

      __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc);
      v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
      srcp += src_stride;
    }

    // Update total sum and clear the vectors
    s += mm256_accumulate_epi32(v_acc_sum);
    ss += mm256_accumulate_epi32(v_acc_sqs);
    v_acc_sum = vzero;
    v_acc_sqs = vzero;
  }

  // Process remaining columns (width not a multiple of 16) in scalar code
  srcp = srcp1;
  for (int k = 0; k < height; k++) {
    for (int m = i; m < width; m++) {
      uint16_t val = srcp[m];
      s += val;
      ss += (uint32_t)val * val;  // cast avoids signed-int overflow for
                                  // full-range 16-bit samples
    }
    srcp += src_stride;
  }
  return (ss - s * s / (width * height));
}
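
// Usage sketch (illustrative): high-bitdepth buffers in libaom are passed
// as uint8_t pointers, and the CONVERT_TO_SHORTPTR above undoes the
// CONVERT_TO_BYTEPTR a caller would apply. Hypothetical example:
//
//   uint16_t frame[64 * 64];  // 10- or 12-bit samples
//   // ... fill frame ...
//   const uint64_t sse =
//       aom_var_2d_u16_avx2(CONVERT_TO_BYTEPTR(frame), 64, 64, 64);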