| /* |
| * Copyright (c) 2025, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 3-Clause Clear License |
| * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear |
| * License was not distributed with this source code in the LICENSE file, you |
| * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the |
| * Alliance for Open Media Patent License 1.0 was not distributed with this |
| * source code in the PATENTS file, you can obtain it at |
| * aomedia.org/license/patent-license/. |
| */ |
| |
| #include <immintrin.h> |
| |
| #include "av1/common/gdf_block.h" |
| |
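| // Computes, for each of the two rows A and B of a 2x2 unit, the absolute |
| // second difference |2 * y0 - y_1 - y1| of every 16-bit sample and sums the |
| // two rows into 'lap'. |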
| #define gdf_calculate_laplacian_2x2_reg(lap, lap0, lap1, y0A, y_1A, y1A, y0B, \ |
| y_1B, y1B) \ |
| lap0 = _mm256_abs_epi16(_mm256_sub_epi16( \ |
| _mm256_sub_epi16(_mm256_slli_epi16(y0A, 1), y_1A), y1A)); \ |
| lap1 = _mm256_abs_epi16(_mm256_sub_epi16( \ |
| _mm256_sub_epi16(_mm256_slli_epi16(y0B, 1), y_1B), y1B)); \ |
| lap = _mm256_add_epi16(lap0, lap1); |
| |
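| // Aggregates two adjacent 2x2 row-pair Laplacians into a 4x4 sum: adds the |
| // previous and current row pairs, then the neighboring 16-bit column within |
| // each 32-bit lane, then the next 32-bit lane (four adjacent columns in |
| // total), and finally masks the sums down to the precision selected by |
| // 'clip_mask'. |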
| #define gdf_calculate_laplacian_4x4_reg( \ |
| lap4x4, lap_prev, lap_cur, shuffle_mask, shuffle_mask2, clip_mask) \ |
| lap4x4 = _mm256_add_epi16(lap_prev, lap_cur); \ |
| lap4x4 = \ |
| _mm256_add_epi16(lap4x4, _mm256_shuffle_epi8(lap4x4, shuffle_mask)); \ |
| lap4x4 = _mm256_add_epi16( \ |
| lap4x4, _mm256_permutevar8x32_epi32(lap4x4, shuffle_mask2)); \ |
| lap4x4 = _mm256_and_si256(lap4x4, clip_mask); |
| |
| /*!\brief Calculates the gradients and classes of the 2x2 pixel units used in |
|  * GDF for the block bounded by [i_min, j_min] and [i_max, j_max]. |
|  * The gradients and classes are stored in gdf_lap_y and gdf_cls_y, |
|  * respectively. |
|  */ |
| void gdf_set_lap_and_cls_unit_avx2( |
| const int i_min, const int i_max, const int j_min, const int j_max, |
| const int stripe_size, const uint16_t *rec_pnt, const int rec_stride, |
| const int bit_depth, uint16_t *const *gdf_lap_y, const int gdf_lap_y_stride, |
| uint32_t *gdf_cls_y, const int gdf_cls_y_stride) { |
| const int offset_ver = rec_stride, offset_dia0 = rec_stride + 1, |
| offset_dia1 = rec_stride - 1; |
| __m256i shuffle_mask = |
| _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, |
| 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); |
| __m256i shuffle_mask2 = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); |
| __m256i clip_mask = _mm256_set1_epi16( |
| (short)((1 << (16 - (GDF_TEST_INP_PREC - |
| AOMMIN(GDF_TEST_INP_PREC, bit_depth)))) - |
| 1)); |
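|   // 16 columns are computed per iteration but j advances by 14: the |
|   // horizontal 4x4 aggregation wraps around at the last 32-bit lane, so the |
|   // last two columns of each store are rewritten by the next iteration. |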
| for (int j = 0; j < (j_max - j_min); j += 14) { |
| const uint16_t *std_pos = rec_pnt + (i_max - i_min) * rec_stride + j; |
| const uint16_t *std_pos_1; |
| const uint16_t *std_pos0; |
| const uint16_t *std_pos1; |
| const uint16_t *std_pos2; |
| |
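|     // When row i_max is the first row of a new stripe, only |
|     // GDF_TEST_LINE_BUFFER rows at or below the stripe boundary may be read; |
|     // the remaining row pointers are redirected to rows above the boundary. |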
| if ((i_max + GDF_TEST_STRIPE_OFF) % stripe_size == 0) { |
| #if (GDF_TEST_LINE_BUFFER >= 3) |
| std_pos_1 = std_pos - rec_stride; |
| std_pos0 = std_pos; |
| std_pos1 = std_pos0 + rec_stride; |
| std_pos2 = std_pos1 + rec_stride; |
| #elif (GDF_TEST_LINE_BUFFER == 2) |
| std_pos_1 = std_pos - rec_stride; |
| std_pos0 = std_pos; |
| std_pos1 = std_pos0 + rec_stride; |
| std_pos2 = std_pos - (rec_stride << 2); |
| #elif (GDF_TEST_LINE_BUFFER == 1) |
| std_pos_1 = std_pos - rec_stride; |
| std_pos0 = std_pos; |
| std_pos1 = std_pos_1 - (rec_stride << 1); |
| std_pos2 = std_pos1 - rec_stride; |
| #else |
| std_pos_1 = std_pos - rec_stride; |
| std_pos0 = std_pos_1 - rec_stride; |
| std_pos1 = std_pos0 - rec_stride; |
| std_pos2 = std_pos1 - rec_stride; |
| #endif |
| } else { |
| std_pos_1 = std_pos - rec_stride; |
| std_pos0 = std_pos; |
| std_pos1 = std_pos0 + rec_stride; |
| std_pos2 = std_pos1 + rec_stride; |
| } |
| __m256i lap0, lap1; |
| __m256i prev_ver_reg, prev_hor_reg, prev_dia0_reg, prev_dia1_reg; |
| |
| __m256i y00 = _mm256_loadu_si256((const __m256i *)(std_pos0)); |
| __m256i y10 = _mm256_loadu_si256((const __m256i *)(std_pos1)); |
| __m256i y_10 = _mm256_loadu_si256((const __m256i *)(std_pos_1)); |
| __m256i y20 = _mm256_loadu_si256((const __m256i *)(std_pos2)); |
| gdf_calculate_laplacian_2x2_reg(prev_ver_reg, lap0, lap1, y00, y_10, y10, |
| y10, y00, y20); |
| |
| __m256i y0_1 = _mm256_loadu_si256((const __m256i *)(std_pos0 - 1)); |
| __m256i y01 = _mm256_loadu_si256((const __m256i *)(std_pos0 + 1)); |
| __m256i y1_1 = _mm256_loadu_si256((const __m256i *)(std_pos1 - 1)); |
| __m256i y11 = _mm256_loadu_si256((const __m256i *)(std_pos1 + 1)); |
| gdf_calculate_laplacian_2x2_reg(prev_hor_reg, lap0, lap1, y00, y0_1, y01, |
| y10, y1_1, y11); |
| |
| __m256i y_1_1 = _mm256_loadu_si256((const __m256i *)(std_pos_1 - 1)); |
| __m256i y21 = _mm256_loadu_si256((const __m256i *)(std_pos2 + 1)); |
| gdf_calculate_laplacian_2x2_reg(prev_dia0_reg, lap0, lap1, y00, y_1_1, y11, |
| y10, y0_1, y21); |
| |
| __m256i y_11 = _mm256_loadu_si256((const __m256i *)(std_pos_1 + 1)); |
| __m256i y2_1 = _mm256_loadu_si256((const __m256i *)(std_pos2 - 1)); |
| gdf_calculate_laplacian_2x2_reg(prev_dia1_reg, lap0, lap1, y00, y_11, y1_1, |
| y10, y01, y2_1); |
| |
| for (int i = (i_max - i_min - 2); i >= 0; i -= 2) { |
| __m256i cur_ver_reg, cur_hor_reg, cur_dia0_reg, cur_dia1_reg; |
| __m256i out_ver_reg, out_hor_reg, out_dia0_reg, out_dia1_reg; |
| |
| std_pos = rec_pnt + i * rec_stride + j; |
| y00 = _mm256_loadu_si256((const __m256i *)(std_pos)); |
| y10 = _mm256_loadu_si256((const __m256i *)(std_pos + offset_ver)); |
| |
| #if !GDF_TEST_LINE_BUFFER |
| if ((i == 0) && ((i_min + GDF_TEST_STRIPE_OFF) % stripe_size == 0)) |
| y_10 = y00; |
| else |
| #endif |
| y_10 = _mm256_loadu_si256((const __m256i *)(std_pos - offset_ver)); |
| #if !GDF_TEST_LINE_BUFFER |
| if ((i == (i_max - i_min - 2)) && |
| ((i_max + GDF_TEST_STRIPE_OFF) % stripe_size == 0)) |
| y20 = y00; |
| else |
| #endif |
| y20 = _mm256_loadu_si256( |
| (const __m256i *)(std_pos + offset_ver + offset_ver)); |
| gdf_calculate_laplacian_2x2_reg(cur_ver_reg, lap0, lap1, y00, y_10, y10, |
| y10, y00, y20); |
| gdf_calculate_laplacian_4x4_reg(out_ver_reg, prev_ver_reg, cur_ver_reg, |
| shuffle_mask, shuffle_mask2, clip_mask); |
| _mm256_storeu_si256( |
| (__m256i *)(gdf_lap_y[GDF_VER] + (i >> 1) * gdf_lap_y_stride + j), |
| out_ver_reg); |
| prev_ver_reg = cur_ver_reg; |
| |
| y0_1 = _mm256_loadu_si256((const __m256i *)(std_pos - 1)); |
| y01 = _mm256_loadu_si256((const __m256i *)(std_pos + 1)); |
| y1_1 = _mm256_loadu_si256((const __m256i *)(std_pos + offset_ver - 1)); |
| y11 = _mm256_loadu_si256((const __m256i *)(std_pos + offset_ver + 1)); |
| gdf_calculate_laplacian_2x2_reg(cur_hor_reg, lap0, lap1, y00, y0_1, y01, |
| y10, y1_1, y11); |
| gdf_calculate_laplacian_4x4_reg(out_hor_reg, prev_hor_reg, cur_hor_reg, |
| shuffle_mask, shuffle_mask2, clip_mask); |
| _mm256_storeu_si256( |
| (__m256i *)(gdf_lap_y[GDF_HOR] + (i >> 1) * gdf_lap_y_stride + j), |
| out_hor_reg); |
| prev_hor_reg = cur_hor_reg; |
| |
| #if !GDF_TEST_LINE_BUFFER |
| if ((i == 0) && ((i_min + GDF_TEST_STRIPE_OFF) % stripe_size == 0)) |
| y_1_1 = y0_1; |
| else |
| #endif |
| y_1_1 = _mm256_loadu_si256((const __m256i *)(std_pos - offset_dia0)); |
| #if !GDF_TEST_LINE_BUFFER |
| if ((i == (i_max - i_min - 2)) && |
| ((i_max + GDF_TEST_STRIPE_OFF) % stripe_size == 0)) |
| y21 = y01; |
| else |
| #endif |
| y21 = _mm256_loadu_si256( |
| (const __m256i *)(std_pos + offset_ver + offset_dia0)); |
| gdf_calculate_laplacian_2x2_reg(cur_dia0_reg, lap0, lap1, y00, y_1_1, y11, |
| y10, y0_1, y21); |
| gdf_calculate_laplacian_4x4_reg(out_dia0_reg, prev_dia0_reg, cur_dia0_reg, |
| shuffle_mask, shuffle_mask2, clip_mask); |
| _mm256_storeu_si256( |
| (__m256i *)(gdf_lap_y[GDF_DIAG0] + (i >> 1) * gdf_lap_y_stride + j), |
| out_dia0_reg); |
| prev_dia0_reg = cur_dia0_reg; |
| |
| #if !GDF_TEST_LINE_BUFFER |
| if ((i == 0) && ((i_min + GDF_TEST_STRIPE_OFF) % stripe_size == 0)) |
| y_11 = y01; |
| else |
| #endif |
| y_11 = _mm256_loadu_si256((const __m256i *)(std_pos - offset_dia1)); |
| #if !GDF_TEST_LINE_BUFFER |
| if ((i == (i_max - i_min - 2)) && |
| ((i_max + GDF_TEST_STRIPE_OFF) % stripe_size == 0)) |
| y2_1 = y0_1; |
| else |
| #endif |
| y2_1 = _mm256_loadu_si256( |
| (const __m256i *)(std_pos + offset_ver + offset_dia1)); |
| gdf_calculate_laplacian_2x2_reg(cur_dia1_reg, lap0, lap1, y00, y_11, y1_1, |
| y10, y01, y2_1); |
| gdf_calculate_laplacian_4x4_reg(out_dia1_reg, prev_dia1_reg, cur_dia1_reg, |
| shuffle_mask, shuffle_mask2, clip_mask); |
| _mm256_storeu_si256( |
| (__m256i *)(gdf_lap_y[GDF_DIAG1] + (i >> 1) * gdf_lap_y_stride + j), |
| out_dia1_reg); |
| prev_dia1_reg = cur_dia1_reg; |
| |
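|       // Derive the 2-bit class of each 2x2 unit from the 4x4 gradient sums: |
|       // bit 0 is set when the vertical activity does not exceed the |
|       // horizontal activity, bit 1 when diagonal-0 does not exceed |
|       // diagonal-1. Subtracting 0x8000 turns the unsigned 16-bit comparison |
|       // into a signed one. |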
| __m256i offset12 = _mm256_set1_epi16((int16_t)0x8000); |
| __m256i cls_reg = _mm256_or_si256( |
| _mm256_add_epi16( |
| _mm256_cmpgt_epi16(_mm256_sub_epi16(out_ver_reg, offset12), |
| _mm256_sub_epi16(out_hor_reg, offset12)), |
| _mm256_set1_epi16(1)), |
| _mm256_slli_epi16( |
| _mm256_add_epi16( |
| _mm256_cmpgt_epi16(_mm256_sub_epi16(out_dia0_reg, offset12), |
| _mm256_sub_epi16(out_dia1_reg, offset12)), |
| _mm256_set1_epi16(1)), |
| 1)); |
| cls_reg = _mm256_and_si256(cls_reg, _mm256_set1_epi32(3)); |
| _mm256_storeu_si256( |
| (__m256i *)(gdf_cls_y + (i >> 1) * gdf_cls_y_stride + (j >> 1)), |
| cls_reg); |
| } |
| } |
| } |
| |
| /*!\brief Applies the expected coding error, scaled by the controlling |
|  * parameter, to generate the final filtered block. |
|  */ |
| void gdf_compensation_unit_avx2(uint16_t *rec_pnt, const int rec_stride, |
| int16_t *err_pnt, const int err_stride, |
| const int err_shift, const int scale, |
| const int pxl_max, const int blk_height, |
| const int blk_width) { |
| const int err_shift_half = err_shift > 0 ? 1 << (err_shift - 1) : 0; |
| const int j_avx2 = ((blk_width) >> 4) << 4; |
| __m256i scale_reg = _mm256_set1_epi16(scale); |
| __m256i zero_reg = _mm256_setzero_si256(); |
| __m256i tgt_half_reg = _mm256_set1_epi16(err_shift_half); |
| __m256i pxl_max_reg = _mm256_set1_epi16(pxl_max); |
| |
| for (int i = 0; i < blk_height; i++) { |
| for (int j = 0; j < j_avx2; j += 16) { |
| __m256i err_reg = _mm256_loadu_si256((__m256i *)(err_pnt + j)); |
| __m256i neg_err_mask = _mm256_cmpgt_epi16(zero_reg, err_reg); |
| __m256i abs_err_reg = _mm256_abs_epi16(err_reg); |
| __m256i out_reg00 = _mm256_mullo_epi16(abs_err_reg, scale_reg); |
| out_reg00 = _mm256_add_epi16(out_reg00, tgt_half_reg); |
| out_reg00 = _mm256_srli_epi16(out_reg00, err_shift); |
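|       // (x ^ mask) - mask negates x wherever 'mask' is all ones, restoring |
|       // the sign of the original error. |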
| out_reg00 = _mm256_sub_epi16(_mm256_xor_si256(out_reg00, neg_err_mask), |
| neg_err_mask); |
| |
| __m256i rec_reg = _mm256_loadu_si256((__m256i *)(rec_pnt + j)); |
| out_reg00 = _mm256_add_epi16(out_reg00, rec_reg); |
| out_reg00 = _mm256_max_epi16(out_reg00, zero_reg); |
| out_reg00 = _mm256_min_epi16(out_reg00, pxl_max_reg); |
| _mm256_storeu_si256((__m256i *)(rec_pnt + j), out_reg00); |
| } |
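|     // Scalar tail for block widths that are not a multiple of 16. |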
| for (int j = j_avx2; j < blk_width; j++) { |
| int16_t res_pxl = scale * (*(err_pnt + j)); |
| uint16_t *rec_ptr = rec_pnt + j; |
| if (res_pxl > 0) { |
| res_pxl = (res_pxl + err_shift_half) >> err_shift; |
| } else { |
| res_pxl = -(((-res_pxl) + err_shift_half) >> err_shift); |
| } |
| *rec_ptr = (int16_t)CLIP(res_pxl + (*rec_ptr), 0, pxl_max); |
| } |
| rec_pnt += rec_stride; |
| err_pnt += err_stride; |
| } |
| } |
| |
| // Load the clipping (alpha) register in the shape of [alpha[k]_sample[i], |
| // alpha[k]_sample[i+1], ..., alpha[k]_sample[i+15]]: |
| // the difference between each i-th sample and the center sample is clipped |
| // into the range [-alpha, alpha]. |
| #define gdf_load_alpha_reg(clip_max_reg, clip_min_reg, alphaOff, \ |
| m256i_tmp_reg, m256_tmp_reg, cls_idx) \ |
| m256i_tmp_reg = _mm256_set1_epi64x(*((const long long *)(alphaOff))); \ |
| m256_tmp_reg = _mm256_castsi256_ps( \ |
| _mm256_unpacklo_epi16(m256i_tmp_reg, m256i_tmp_reg)); \ |
| __m256i clip_max_reg = \ |
| _mm256_castps_si256(_mm256_permutevar_ps(m256_tmp_reg, cls_idx)); \ |
| __m256i clip_min_reg = _mm256_sub_epi16(_mm256_setzero_si256(), clip_max_reg); |
| |
| // Load the bias register in the shape of [hi128, lo128]: |
| // both 128-bit halves hold [b_class0, b_class1, b_class2, b_class3], |
| // where each b_classX is 32 bits. |
| #define gdf_load_bias_reg(bias_regx, biasOff) \ |
| __m256 bias_regx = \ |
| _mm256_loadu2_m128((const float *)(biasOff), (const float *)(biasOff)); |
| |
| // Load a weight register in the shape of [weight[k]_sample[i], |
| // weight[k]_sample[i+1], ..., weight[k]_sample[i+15]]: |
| // each of the eight 32-bit lanes holds the 16-bit weight selected by that |
| // lane's class index, duplicated in both 16-bit halves, i.e. |
| // weight_regx = [W[cls_idx[lane0]], W[cls_idx[lane0]], W[cls_idx[lane1]], |
| // W[cls_idx[lane1]], ..., W[cls_idx[lane7]], W[cls_idx[lane7]]], so that it |
| // can feed _mm256_madd_epi16 against either the odd or the even clipped |
| // samples. |
| #define gdf_load_weight_reg(weight_regx, weightOff, m256i_tmp_reg, \ |
| m256_tmp_reg, cls_idx) \ |
| m256i_tmp_reg = _mm256_set1_epi64x(*((const long long *)(weightOff))); \ |
| m256_tmp_reg = _mm256_castsi256_ps( \ |
| _mm256_unpacklo_epi16(m256i_tmp_reg, m256i_tmp_reg)); \ |
| __m256i weight_regx = \ |
| _mm256_castps_si256(_mm256_permutevar_ps(m256_tmp_reg, cls_idx)); |
| |
| // Generate two vectors from the clipped 16-bit samples X[0..15]: |
| // odd_clip  keeps X[0], X[2], ..., X[14] in the low half of each 32-bit |
| //           lane (high half zeroed), |
| // even_clip keeps X[1], X[3], ..., X[15] in the high half of each 32-bit |
| //           lane (low half zeroed). |
| #define gdf_clip_input_reg(odd_clip, even_clip, sample_reg, clip_min_reg, \ |
| clip_max_reg, m256i_tmp_reg_01, m256i_tmp_reg_02, \ |
| odd_mask) \ |
| m256i_tmp_reg_01 = _mm256_max_epi16(sample_reg, clip_min_reg); \ |
| m256i_tmp_reg_02 = _mm256_min_epi16(m256i_tmp_reg_01, clip_max_reg); \ |
| __m256i odd_clip = _mm256_and_si256(odd_mask, m256i_tmp_reg_02); \ |
| __m256i even_clip = _mm256_andnot_si256(odd_mask, m256i_tmp_reg_02); |
| |
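| // Quantize a 32-bit feature accumulator into a LUT index: scale the |
| // magnitude, round, shift down by lut_shift, restore the sign, subtract |
| // idx_min (which is negative) to bias the index, and clamp it to |
| // [0, idx_max]. |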
| #define gdf_quant_feature_reg(out_regxx, neg_mask, zero_reg, scale_value, \ |
| half_value, lut_shift, idx_min_reg, idx_max_reg) \ |
| neg_mask = _mm256_cmpgt_epi32(zero_reg, out_regxx); \ |
| out_regxx = _mm256_abs_epi32(out_regxx); \ |
| out_regxx = _mm256_mullo_epi32(out_regxx, scale_value); \ |
| out_regxx = _mm256_add_epi32(out_regxx, half_value); \ |
| out_regxx = _mm256_srli_epi32(out_regxx, lut_shift); \ |
| out_regxx = \ |
| _mm256_sub_epi32(_mm256_xor_si256(out_regxx, neg_mask), neg_mask); \ |
| out_regxx = _mm256_sub_epi32(out_regxx, idx_min_reg); \ |
| out_regxx = _mm256_max_epi32(out_regxx, zero_reg); \ |
| out_regxx = _mm256_min_epi32(out_regxx, idx_max_reg); |
| |
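| // Multiply the clipped odd/even samples by the per-class weights |
| // (_mm256_madd_epi16 sums each pair of 16-bit products into 32 bits) and |
| // accumulate into the two 32-bit feature accumulators. |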
| #define gdf_mult_weight_to_input_reg(out_regx0, out_regx1, mul_regx0, \ |
| mul_regx1, odd_clip, even_clip, \ |
| weight_regx) \ |
| mul_regx0 = _mm256_madd_epi16(odd_clip, weight_regx); \ |
| mul_regx1 = _mm256_madd_epi16(even_clip, weight_regx); \ |
| out_regx0 = _mm256_add_epi32(mul_regx0, out_regx0); \ |
| out_regx1 = _mm256_add_epi32(mul_regx1, out_regx1); |
| |
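| // Initialize both 32-bit feature accumulators with the bias of each lane's |
| // class, selected from bias_regx with _mm256_permutevar_ps. |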
| #define gdf_assign_bias_to_output_reg(out_regx0, out_regx1, bias_regx, \ |
| cls_idx) \ |
| __m256i out_regx0 = \ |
| _mm256_castps_si256(_mm256_permutevar_ps(bias_regx, cls_idx)); \ |
| __m256i out_regx1 = out_regx0; |
| |
| // Swap the vertical and horizontal weight/feature when the class index is |
| // odd (i.e., 1 or 3); cls_is_odd has the LSB shifted up to bit 31 so it can |
| // drive _mm256_blendv_ps. |
| #define gdf_swap_value32bit_by_mask32bit( \ |
| vert_reg, horz_reg, m256_vert_tmp_reg, m256_horz_tmp_reg, cls_is_odd) \ |
| m256_vert_tmp_reg = _mm256_castsi256_ps(vert_reg); \ |
| m256_horz_tmp_reg = _mm256_castsi256_ps(horz_reg); \ |
| vert_reg = _mm256_castps_si256(_mm256_blendv_ps( \ |
| m256_vert_tmp_reg, m256_horz_tmp_reg, _mm256_castsi256_ps(cls_is_odd))); \ |
| horz_reg = _mm256_castps_si256(_mm256_blendv_ps( \ |
| m256_horz_tmp_reg, m256_vert_tmp_reg, _mm256_castsi256_ps(cls_is_odd))); |
| |
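| // Packs the three quantized feature indices into an intra LUT index: |
| // (f0 << 8) + (f1 << 4) + f2. |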
| static inline __m256i gdf_intra_get_idx(__m256i *out_reg0, __m256i *out_reg1, |
| __m256i *out_reg2) { |
| return _mm256_add_epi32(_mm256_add_epi32(_mm256_slli_epi32(*out_reg0, 8), |
| _mm256_slli_epi32(*out_reg1, 4)), |
| *out_reg2); |
| } |
| |
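| // Packs the three quantized feature indices into an inter LUT index via |
| // shift-and-add: 100 * f0 + 10 * f1 + f2 (64 + 32 + 4 = 100, 8 + 2 = 10). |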
| static inline __m256i gdf_inter_get_idx(__m256i *out_reg0, __m256i *out_reg1, |
| __m256i *out_reg2) { |
| return _mm256_add_epi32( |
| _mm256_add_epi32( |
| _mm256_add_epi32(_mm256_add_epi32(_mm256_slli_epi32(*out_reg0, 6), |
| _mm256_slli_epi32(*out_reg0, 5)), |
| _mm256_slli_epi32(*out_reg0, 2)), |
| _mm256_add_epi32(_mm256_slli_epi32(*out_reg1, 3), |
| _mm256_slli_epi32(*out_reg1, 1))), |
| *out_reg2); |
| } |
| |
| /*!\brief Generates the vertical/horizontal/mixed features and then looks up |
|  * the expected coding error with the corresponding quantized feature |
|  * indices. |
|  */ |
| void gdf_inference_unit_avx2(const int i_min, const int i_max, const int j_min, |
| const int j_max, const int stripe_size, |
| const int qp_idx, const uint16_t *rec_pnt, |
| const int rec_stride, uint16_t *const *gdf_lap_pnt, |
| const int gdf_lap_stride, |
| const uint32_t *gdf_cls_pnt, |
| const int gdf_cls_stride, int16_t *err_pnt, |
| const int err_stride, const int pxl_shift, |
| const int ref_dst_idx) { |
| assert(((i_max - i_min) & 1) == 0); |
| assert(((j_max - j_min) & 1) == 0); |
| assert((i_min & 1) == 0); |
| assert((j_min & 1) == 0); |
| |
| const int is_intra = ref_dst_idx == 0 ? 1 : 0; |
| const int lut_frm_max = |
| is_intra ? GDF_NET_LUT_IDX_INTRA_MAX : GDF_NET_LUT_IDX_INTER_MAX; |
| const int lut_idx_min = -(lut_frm_max >> 1); |
| const int lut_idx_max = lut_frm_max - 1 + lut_idx_min; |
| const int lut_idx_scale = AOMMAX(-lut_idx_min, lut_idx_max); |
| int32_t lut_shift = |
| GDF_TEST_INP_PREC - GDF_TRAIN_INP_PREC + GDF_TRAIN_PAR_SCALE_LOG2; |
|   int32_t lut_shift_half = 1 << (lut_shift - 1); |
| const int16_t *alpha, *weight; |
| const int32_t *bias; |
| const int8_t *gdf_table; |
| const uint16_t *copied_lap_pnt[GDF_NET_INP_GRD_NUM]; |
| memcpy(copied_lap_pnt, gdf_lap_pnt, |
| sizeof(const uint16_t *) * GDF_NET_INP_GRD_NUM); |
| if (is_intra) { |
| alpha = gdf_intra_alpha_table[qp_idx]; |
| weight = gdf_intra_weight_table[qp_idx]; |
| bias = gdf_intra_bias_table[qp_idx]; |
| gdf_table = gdf_intra_error_table[qp_idx]; |
| } else { |
| alpha = gdf_inter_alpha_table[ref_dst_idx - 1][qp_idx]; |
| weight = gdf_inter_weight_table[ref_dst_idx - 1][qp_idx]; |
| bias = gdf_inter_bias_table[ref_dst_idx - 1][qp_idx]; |
| gdf_table = gdf_inter_error_table[ref_dst_idx - 1][qp_idx]; |
| } |
| __m256i (*gdf_get_idx_func)(__m256i *, __m256i *, __m256i *) = |
| is_intra ? gdf_intra_get_idx : gdf_inter_get_idx; |
| |
| gdf_load_bias_reg(bias_reg0, bias); |
| gdf_load_bias_reg(bias_reg1, bias + GDF_NET_INP_GRD_NUM); |
| gdf_load_bias_reg(bias_reg2, |
| bias + GDF_NET_INP_GRD_NUM + GDF_NET_INP_GRD_NUM); |
| |
| int16_t *tgt_line = err_pnt; |
| const uint16_t *rec_ptr = rec_pnt; |
| |
| __m256i m256i_tmp_reg_01, m256i_tmp_reg_02; |
| __m256i odd_mask = _mm256_set1_epi32(0x0000ffff); |
| const __m256i min_val = _mm256_set1_epi16(-(1 << (GDF_TEST_INP_PREC - 1))); |
| const __m256i max_val = _mm256_set1_epi16((1 << (GDF_TEST_INP_PREC - 1)) - 1); |
| __m256 m256_tmp_reg, m256_tmp_reg_02; |
| |
| for (int i = 0; i < (i_max - i_min); i++) { |
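|     // Vertical offsets, relative to row i, that the filter may read: the |
|     // current stripe extended by GDF_TEST_LINE_BUFFER rows above and below. |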
| int vertical_spatial_support_min = |
| -GDF_TEST_LINE_BUFFER - |
| ((i + i_min + GDF_TEST_STRIPE_OFF) % stripe_size); |
| int vertical_spatial_support_max = |
| (stripe_size - 1 + GDF_TEST_LINE_BUFFER) - |
| ((i + i_min + GDF_TEST_STRIPE_OFF) % stripe_size); |
| for (int j = 0; j < (j_max - j_min); j += 16) { |
| __m256i cls_idx = |
| _mm256_load_si256((const __m256i *)(gdf_cls_pnt + (j >> 1))); |
| __m256i cls_is_odd = _mm256_slli_epi32(cls_idx, 31); |
| gdf_assign_bias_to_output_reg(out_reg00, out_reg01, bias_reg0, cls_idx); |
| gdf_assign_bias_to_output_reg(out_reg10, out_reg11, bias_reg1, cls_idx); |
| gdf_assign_bias_to_output_reg(out_reg20, out_reg21, bias_reg2, cls_idx); |
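|       // For the odd classes (1 and 3) the roles of the vertical and |
|       // horizontal features are exchanged, so swap the corresponding |
|       // accumulators here and swap them back after the guided-sample loop. |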
| gdf_swap_value32bit_by_mask32bit(out_reg00, out_reg10, m256_tmp_reg, |
| m256_tmp_reg_02, cls_is_odd); |
| gdf_swap_value32bit_by_mask32bit(out_reg01, out_reg11, m256_tmp_reg, |
| m256_tmp_reg_02, cls_is_odd); |
| |
| for (int k = 0; k < GDF_NET_INP_REC_NUM; k++) { |
| __m256i input_reg1 = _mm256_loadu_si256((const __m256i *)(rec_ptr + j)); |
| #if GDF_TEST_VIRTUAL_BOUNDARY |
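|         // Mirror the forward sample's vertical offset when it falls |
|         // outside the rows readable above the virtual boundary. |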
| int gdf_rec_coordinates_fwd = |
| (gdf_guided_sample_coordinates_fwd[k][0] < |
| vertical_spatial_support_min) |
| ? -gdf_guided_sample_coordinates_fwd[k][0] |
| : gdf_guided_sample_coordinates_fwd[k][0]; |
| const uint16_t *s_pos_fwd = rec_ptr + j + |
| (gdf_rec_coordinates_fwd * rec_stride) + |
| gdf_guided_sample_coordinates_fwd[k][1]; |
| #else  // !GDF_TEST_VIRTUAL_BOUNDARY |
| const uint16_t *s_pos_fwd = |
| rec_ptr + j + |
| (gdf_guided_sample_coordinates_fwd[k][0] * rec_stride) + |
| gdf_guided_sample_coordinates_fwd[k][1]; |
| #endif  // GDF_TEST_VIRTUAL_BOUNDARY |
| m256i_tmp_reg_01 = _mm256_loadu_si256((const __m256i *)(s_pos_fwd)); |
| m256i_tmp_reg_02 = _mm256_sub_epi16(m256i_tmp_reg_01, input_reg1); |
| __m256i sample_reg0 = _mm256_slli_epi16(m256i_tmp_reg_02, pxl_shift); |
| |
| #if GDF_TEST_VIRTUAL_BOUNDARY |
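|         // Mirror the backward sample's vertical offset when it falls |
|         // outside the rows readable below the virtual boundary. |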
| int gdf_rec_coordinates_bwd = |
| (gdf_guided_sample_coordinates_bwd[k][0] > |
| vertical_spatial_support_max) |
| ? -gdf_guided_sample_coordinates_bwd[k][0] |
| : gdf_guided_sample_coordinates_bwd[k][0]; |
| const uint16_t *s_pos_bwd = rec_ptr + j + |
| (gdf_rec_coordinates_bwd * rec_stride) + |
| gdf_guided_sample_coordinates_bwd[k][1]; |
| #else  // !GDF_TEST_VIRTUAL_BOUNDARY |
| const uint16_t *s_pos_bwd = |
| rec_ptr + j + |
| (gdf_guided_sample_coordinates_bwd[k][0] * rec_stride) + |
| gdf_guided_sample_coordinates_bwd[k][1]; |
| #endif  // GDF_TEST_VIRTUAL_BOUNDARY |
| m256i_tmp_reg_01 = _mm256_loadu_si256((const __m256i *)(s_pos_bwd)); |
| m256i_tmp_reg_02 = _mm256_sub_epi16(m256i_tmp_reg_01, input_reg1); |
| __m256i sample_reg1 = _mm256_slli_epi16(m256i_tmp_reg_02, pxl_shift); |
| |
| gdf_load_alpha_reg(clip_max_reg, clip_min_reg, |
| alpha + k * GDF_TRAIN_CLS_NUM, m256i_tmp_reg_01, |
| m256_tmp_reg, cls_idx); |
| gdf_clip_input_reg(odd_clip0, even_clip0, sample_reg0, clip_min_reg, |
| clip_max_reg, m256i_tmp_reg_01, m256i_tmp_reg_02, |
| odd_mask); |
| gdf_clip_input_reg(odd_clip1, even_clip1, sample_reg1, clip_min_reg, |
| clip_max_reg, m256i_tmp_reg_01, m256i_tmp_reg_02, |
| odd_mask); |
| __m256i odd_clip = _mm256_min_epi16( |
| _mm256_max_epi16(_mm256_add_epi16(odd_clip0, odd_clip1), min_val), |
| max_val); |
| __m256i even_clip = _mm256_min_epi16( |
| _mm256_max_epi16(_mm256_add_epi16(even_clip0, even_clip1), min_val), |
| max_val); |
| |
| gdf_load_weight_reg(weight_reg0, weight + k * GDF_TRAIN_CLS_NUM, |
| m256i_tmp_reg_01, m256_tmp_reg, cls_idx); |
| gdf_load_weight_reg(weight_reg1, |
| weight + k * GDF_TRAIN_CLS_NUM + |
| GDF_OPTS_INP_TOT * GDF_TRAIN_CLS_NUM, |
| m256i_tmp_reg_01, m256_tmp_reg, cls_idx); |
| gdf_swap_value32bit_by_mask32bit(weight_reg0, weight_reg1, m256_tmp_reg, |
| m256_tmp_reg_02, cls_is_odd); |
| if (gdf_guided_sample_vertical_masks[k]) { |
| gdf_mult_weight_to_input_reg(out_reg00, out_reg01, m256i_tmp_reg_01, |
| m256i_tmp_reg_02, odd_clip, even_clip, |
| weight_reg0); |
| } |
| if (gdf_guided_sample_horizontal_masks[k]) { |
| gdf_mult_weight_to_input_reg(out_reg10, out_reg11, m256i_tmp_reg_01, |
| m256i_tmp_reg_02, odd_clip, even_clip, |
| weight_reg1); |
| } |
| if (gdf_guided_sample_mixed_masks[k]) { |
| gdf_load_weight_reg(weight_reg2, |
| weight + k * GDF_TRAIN_CLS_NUM + |
| GDF_OPTS_INP_TOT * 2 * GDF_TRAIN_CLS_NUM, |
| m256i_tmp_reg_01, m256_tmp_reg, cls_idx); |
| gdf_mult_weight_to_input_reg(out_reg20, out_reg21, m256i_tmp_reg_01, |
| m256i_tmp_reg_02, odd_clip, even_clip, |
| weight_reg2); |
| } |
| } |
| gdf_swap_value32bit_by_mask32bit(out_reg00, out_reg10, m256_tmp_reg, |
| m256_tmp_reg_02, cls_is_odd); |
| gdf_swap_value32bit_by_mask32bit(out_reg01, out_reg11, m256_tmp_reg, |
| m256_tmp_reg_02, cls_is_odd); |
| |
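|       // Add the precomputed 4x4 Laplacian gradients (rescaled, clipped and |
|       // weighted per class) to the mixed-feature accumulators. |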
| for (int k = GDF_NET_INP_REC_NUM; |
| k < (GDF_NET_INP_GRD_NUM + GDF_NET_INP_REC_NUM); k++) { |
| m256i_tmp_reg_01 = _mm256_load_si256( |
| (const __m256i *)(copied_lap_pnt[k - GDF_NET_INP_REC_NUM] + j)); |
| m256i_tmp_reg_02 = _mm256_slli_epi16(m256i_tmp_reg_01, pxl_shift); |
| __m256i sample_reg = |
| _mm256_srli_epi16(m256i_tmp_reg_02, GDF_TRAIN_GRD_SHIFT); |
| |
| gdf_load_alpha_reg(clip_max_reg, clip_min_reg, |
| alpha + k * GDF_TRAIN_CLS_NUM, m256i_tmp_reg_01, |
| m256_tmp_reg, cls_idx); |
| gdf_clip_input_reg(odd_clip, even_clip, sample_reg, clip_min_reg, |
| clip_max_reg, m256i_tmp_reg_01, m256i_tmp_reg_02, |
| odd_mask) |
| |
| gdf_load_weight_reg(weight_reg2, |
| weight + k * GDF_TRAIN_CLS_NUM + |
| GDF_OPTS_INP_TOT * 2 * GDF_TRAIN_CLS_NUM, |
| m256i_tmp_reg_01, m256_tmp_reg, cls_idx); |
| gdf_mult_weight_to_input_reg(out_reg20, out_reg21, m256i_tmp_reg_01, |
| m256i_tmp_reg_02, odd_clip, even_clip, |
| weight_reg2) |
| } |
| |
| __m256i scale_value = _mm256_set1_epi32(lut_idx_scale); |
|       __m256i half_value = _mm256_set1_epi32(lut_shift_half); |
| __m256i idx_min_reg = _mm256_set1_epi32(lut_idx_min); |
| __m256i idx_max_reg = _mm256_set1_epi32(lut_frm_max - 1); |
| __m256i zero_reg = _mm256_setzero_si256(); |
| __m256i neg_mask; |
| |
| gdf_quant_feature_reg(out_reg00, neg_mask, zero_reg, scale_value, |
| half_value, lut_shift, idx_min_reg, idx_max_reg); |
| gdf_quant_feature_reg(out_reg01, neg_mask, zero_reg, scale_value, |
| half_value, lut_shift, idx_min_reg, idx_max_reg); |
| gdf_quant_feature_reg(out_reg10, neg_mask, zero_reg, scale_value, |
| half_value, lut_shift, idx_min_reg, idx_max_reg); |
| gdf_quant_feature_reg(out_reg11, neg_mask, zero_reg, scale_value, |
| half_value, lut_shift, idx_min_reg, idx_max_reg); |
| gdf_quant_feature_reg(out_reg20, neg_mask, zero_reg, scale_value, |
| half_value, lut_shift, idx_min_reg, idx_max_reg); |
| gdf_quant_feature_reg(out_reg21, neg_mask, zero_reg, scale_value, |
| half_value, lut_shift, idx_min_reg, idx_max_reg); |
| |
| __m256i lut_idx_odd = |
| gdf_get_idx_func(&out_reg00, &out_reg10, &out_reg20); |
| __m256i lut_idx_even = |
| gdf_get_idx_func(&out_reg01, &out_reg11, &out_reg21); |
| |
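|       // The error table stores one int8 per index: gather the dword that |
|       // contains the addressed byte (index rounded down to a multiple of |
|       // 4), shift the byte into place, sign-extend it, and interleave the |
|       // odd/even results into 16 int16 outputs. |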
| __m256i sub_idx_mask = _mm256_set1_epi32(0x3); |
| __m256i v_odd = _mm256_i32gather_epi32( |
| (int *)gdf_table, _mm256_andnot_si256(sub_idx_mask, lut_idx_odd), 1); |
| __m256i v_even = _mm256_i32gather_epi32( |
| (int *)gdf_table, _mm256_andnot_si256(sub_idx_mask, lut_idx_even), 1); |
| |
| __m256i tv_odd = _mm256_srai_epi32( |
| _mm256_slli_epi32( |
| _mm256_srlv_epi32( |
| v_odd, _mm256_slli_epi32( |
| _mm256_and_si256(sub_idx_mask, lut_idx_odd), 3)), |
| 24), |
| 24); |
| __m256i tv_even = _mm256_srai_epi32( |
| _mm256_slli_epi32( |
| _mm256_srlv_epi32( |
| v_even, _mm256_slli_epi32( |
| _mm256_and_si256(sub_idx_mask, lut_idx_even), 3)), |
| 24), |
| 8); |
| |
| __m256i out_reg = _mm256_blend_epi16(tv_odd, tv_even, 0xAA); |
| |
| _mm256_storeu_si256((__m256i *)(tgt_line + j), out_reg); |
| } |
| gdf_cls_pnt += (i & 1) ? gdf_cls_stride : 0; |
| rec_ptr += rec_stride; |
| tgt_line += err_stride; |
| for (int grd_idx = 0; grd_idx < GDF_NET_INP_GRD_NUM; grd_idx++) { |
| copied_lap_pnt[grd_idx] += (i & 1) ? gdf_lap_stride : 0; |
| } |
| } |
| } |