/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */
| |
| #include <immintrin.h> |
| |
| #include "config/aom_config.h" |
| #include "config/av1_rtcd.h" |
| |
// Evaluates (pixels * alpha_vec + beta_vec) >> shift on eight signed 32-bit
// lanes. The shift is arithmetic, so negative intermediates keep their sign.
static inline __m256i bawp_affine_epi32(__m256i pixels, __m256i alpha_vec,
                                        __m256i beta_vec, int shift) {
  const __m256i scaled = _mm256_mullo_epi32(pixels, alpha_vec);
  const __m256i biased = _mm256_add_epi32(beta_vec, scaled);
  return _mm256_srai_epi32(biased, shift);
}

// Reads four consecutive 16-bit pixels and zero-extends them to 32-bit lanes.
static inline __m128i bawp_load_4px(const uint16_t *src) {
  return _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)src));
}

// Reads eight consecutive 16-bit pixels (unaligned load) and zero-extends them
// to 32-bit lanes.
static inline __m256i bawp_load_8px(const uint16_t *src) {
  return _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)src));
}

// Builds a 256-bit vector with `lo` in the lower 128-bit lane and `hi` in the
// upper 128-bit lane.
static inline __m256i bawp_join_lanes(__m128i lo, __m128i hi) {
  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}

// Rewrites a bw x bh block of 16-bit pixels in place as
//   dst[y][x] = clamp((dst[y][x] * alpha + beta) >> shift, 0, max_pixel)
// where max_pixel is 1023 for bd == 10, 4095 for bd == 12 and 255 otherwise.
//
// Two shapes are vectorized here:
//   - bw == 4 with bh a multiple of 4 (four rows per iteration);
//   - bw a multiple of 8 with bh even (two rows x eight columns per iteration).
// Every other shape is forwarded unchanged to av1_make_bawp_block_c().
//
// dst_stride is the distance between rows, counted in uint16_t elements.
void av1_make_bawp_block_avx2(uint16_t *dst, int dst_stride, int16_t alpha,
                              int32_t beta, int shift, int bw, int bh, int bd) {
  const int four_wide = (bw == 4) && !(bh & 3);
  const int eight_multiple = !(bw & 7) && !(bh & 1);

  // Shapes without a SIMD path are handled by the C implementation.
  if (!four_wide && !eight_multiple) {
    av1_make_bawp_block_c(dst, dst_stride, alpha, beta, shift, bw, bh, bd);
    return;
  }

  int max_pixel = 255;
  if (bd == 10) {
    max_pixel = 1023;
  } else if (bd == 12) {
    max_pixel = 4095;
  }

  const __m256i v_alpha = _mm256_set1_epi32((int)alpha);
  const __m256i v_beta = _mm256_set1_epi32(beta);
  const __m256i v_max = _mm256_set1_epi16((int16_t)max_pixel);

  if (four_wide) {
    for (int y = 0; y < bh; y += 4) {
      uint16_t *const r0 = dst + y * dst_stride;
      uint16_t *const r1 = r0 + dst_stride;
      uint16_t *const r2 = r1 + dst_stride;
      uint16_t *const r3 = r2 + dst_stride;

      // row0 | row1 and row2 | row3, each pixel widened to 32 bits.
      const __m256i in01 =
          bawp_join_lanes(bawp_load_4px(r0), bawp_load_4px(r1));
      const __m256i in23 =
          bawp_join_lanes(bawp_load_4px(r2), bawp_load_4px(r3));

      const __m256i out01 = bawp_affine_epi32(in01, v_alpha, v_beta, shift);
      const __m256i out23 = bawp_affine_epi32(in23, v_alpha, v_beta, shift);

      // The pack operates per 128-bit lane and saturates to [0, 65535], giving
      //   lower lane: row0 row2      upper lane: row1 row3
      // The unsigned min then applies the bit-depth upper bound.
      const __m256i clipped =
          _mm256_min_epu16(_mm256_packus_epi32(out01, out23), v_max);
      const __m128i rows02 = _mm256_castsi256_si128(clipped);
      const __m128i rows13 = _mm256_extracti128_si256(clipped, 1);

      _mm_storel_epi64((__m128i *)r0, rows02);
      _mm_storel_epi64((__m128i *)r1, rows13);
      // Shift the upper 64 bits down to reach rows 2 and 3.
      _mm_storel_epi64((__m128i *)r2, _mm_srli_si128(rows02, 8));
      _mm_storel_epi64((__m128i *)r3, _mm_srli_si128(rows13, 8));
    }
    return;
  }

  // bw is a multiple of 8 and bh is even: process two rows at a time.
  for (int y = 0; y < bh; y += 2) {
    uint16_t *const top = dst + y * dst_stride;
    uint16_t *const bottom = top + dst_stride;

    for (int x = 0; x < bw; x += 8) {
      const __m256i out_top =
          bawp_affine_epi32(bawp_load_8px(top + x), v_alpha, v_beta, shift);
      const __m256i out_bottom =
          bawp_affine_epi32(bawp_load_8px(bottom + x), v_alpha, v_beta, shift);

      // The per-lane pack interleaves the rows as
      //   top[0..3] bottom[0..3] | top[4..7] bottom[4..7];
      // permuting the 64-bit quarters with 0xD8 (order 0, 2, 1, 3) restores
      //   top[0..7] | bottom[0..7].
      const __m256i packed = _mm256_packus_epi32(out_top, out_bottom);
      const __m256i ordered = _mm256_permute4x64_epi64(packed, 0xD8);
      const __m256i clipped = _mm256_min_epu16(ordered, v_max);

      _mm_storeu_si128((__m128i *)(top + x), _mm256_castsi256_si128(clipped));
      _mm_storeu_si128((__m128i *)(bottom + x),
                       _mm256_extracti128_si256(clipped, 1));
    }
  }
}