|  | /* | 
|  | * Copyright (c) 2021, Alliance for Open Media. All rights reserved | 
|  | * | 
|  | * This source code is subject to the terms of the BSD 3-Clause Clear License | 
|  | * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear | 
|  | * License was not distributed with this source code in the LICENSE file, you | 
|  | * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the | 
|  | * Alliance for Open Media Patent License 1.0 was not distributed with this | 
|  | * source code in the PATENTS file, you can obtain it at | 
|  | * aomedia.org/license/patent-license/. | 
|  | */ | 
|  | #include <emmintrin.h> | 
|  | #include <smmintrin.h>  // For SSE4.1 | 
|  | #include <immintrin.h> | 
|  |  | 
|  | #include "config/av1_rtcd.h" | 
|  |  | 
|  | #include "av1/common/cfl.h" | 
|  |  | 
|  | #include "av1/common/reconinter.h" | 
|  |  | 
|  | #if CONFIG_ENABLE_MHCCP | 
/*
 * Places a 64-bit integer in the low half of an XMM register and clears the
 * upper half.
 *
 * Builds without __x86_64__ defined assemble the value from its two 32-bit
 * halves instead of using the 64-bit move intrinsic.
 */
static __m128i read_int64(int64_t x) {
#if !defined(__x86_64__)
  const int32_t low_half = (int32_t)x;
  const int32_t high_half = (int32_t)(x >> 32);
  return _mm_set_epi32(0, 0, high_half, low_half);
#else
  return _mm_cvtsi64_si128(x);
#endif
}
|  |  | 
|  | void mhccp_predict_hv_hbd_sse4_1(const uint16_t *input, uint16_t *dst, | 
|  | bool have_top, bool have_left, int dst_stride, | 
|  | int64_t *alpha_q3, int bit_depth, int width, | 
|  | int height, int dir) { | 
|  | const uint16_t mid_s = (1 << (bit_depth - 1)); | 
|  | const __m128i mid_vec = _mm_set1_epi16(mid_s); | 
|  | const int cfl_stride = CFL_BUF_LINE * 2; | 
|  | const uint16_t *above_row = input - cfl_stride; | 
|  | __m128i rounding_vec = _mm_set1_epi32(MHCCP_DECIM_ROUND); | 
|  | __m128i max_val_vec; | 
|  | switch (bit_depth) { | 
|  | case 8: | 
|  | default: max_val_vec = _mm_set1_epi16(255); break; | 
|  | case 10: max_val_vec = _mm_set1_epi16(1023); break; | 
|  | case 12: max_val_vec = _mm_set1_epi16(4095); break; | 
|  | } | 
|  | __m128i zero_vec = _mm_setzero_si128(); | 
|  |  | 
|  | for (int j = 0; j < height; j++) { | 
|  | const uint16_t *current_input_row = input; | 
|  | uint16_t *current_dst_row = dst; | 
|  | __m128i left_vec_shr3 = _mm_setzero_si128();  // For the very first block | 
|  | uint16_t left_element = | 
|  | current_input_row[-1] >> 3;  // Value to the left of the current block | 
|  |  | 
|  | for (int i = 0; i < width; i += 8) { | 
|  | __m128i current_in_vec = | 
|  | _mm_loadu_si128((const __m128i *)¤t_input_row[i]); | 
|  | __m128i current_in_shr3_vec = _mm_srli_epi16(current_in_vec, 3); | 
|  |  | 
|  | __m128i vector_1_vec; | 
|  | if (dir == 0) { | 
|  | if (j == 0 && !have_top) { | 
|  | vector_1_vec = current_in_shr3_vec; | 
|  | } else { | 
|  | vector_1_vec = _mm_srli_epi16( | 
|  | _mm_loadu_si128((const __m128i *)&above_row[i]), 3); | 
|  | } | 
|  | } else { | 
|  | __m128i prev_vec; | 
|  | if (i == 0) { | 
|  | uint16_t L = !have_left ? (current_input_row[i] >> 3) : left_element; | 
|  | prev_vec = _mm_setr_epi16(L, L, L, L, L, L, L, L); | 
|  | } else { | 
|  | prev_vec = left_vec_shr3; | 
|  | } | 
|  | vector_1_vec = _mm_alignr_epi8(current_in_shr3_vec, prev_vec, 14); | 
|  | } | 
|  | left_vec_shr3 = current_in_shr3_vec; | 
|  | left_element = | 
|  | _mm_extract_epi16(current_in_shr3_vec, 7);  // Update for next block | 
|  |  | 
|  | // --- Calculate NON_LINEAR (V * V + M) >> BD --- | 
|  | __m128i v_val = current_in_shr3_vec; | 
|  | __m128i v_val_lo = _mm_mullo_epi16(v_val, v_val); | 
|  | __m128i v_val_hi = _mm_mulhi_epi16(v_val, v_val); | 
|  | __m128i v_sq_lo = _mm_unpacklo_epi16(v_val_lo, v_val_hi); | 
|  | __m128i v_sq_hi = _mm_unpackhi_epi16(v_val_lo, v_val_hi); | 
|  |  | 
|  | __m128i mid_vec_bcast_lo = | 
|  | _mm_unpacklo_epi16(mid_vec, _mm_setzero_si128()); | 
|  | __m128i mid_vec_bcast_hi = | 
|  | _mm_unpackhi_epi16(mid_vec, _mm_setzero_si128()); | 
|  |  | 
|  | __m128i non_linear_res_lo = _mm_add_epi32(v_sq_lo, mid_vec_bcast_lo); | 
|  | __m128i non_linear_res_hi = _mm_add_epi32(v_sq_hi, mid_vec_bcast_hi); | 
|  |  | 
|  | non_linear_res_lo = _mm_srai_epi32(non_linear_res_lo, bit_depth); | 
|  | non_linear_res_hi = _mm_srai_epi32(non_linear_res_hi, bit_depth); | 
|  |  | 
|  | __m128i vector_2_vec = | 
|  | _mm_packus_epi32(non_linear_res_lo, non_linear_res_hi); | 
|  | __m128i vector_0_vec = current_in_shr3_vec; | 
|  | __m128i vector_3_vec = mid_vec; | 
|  |  | 
|  | // --- Convolve --- | 
|  | __m128i sum_lo = _mm_setzero_si128(); | 
|  | __m128i sum_hi = _mm_setzero_si128(); | 
|  |  | 
|  | // param 0 | 
|  | __m128i param0_lo_val = read_int64(alpha_q3[0]); | 
|  | __m128i param0_lo_bcast = | 
|  | _mm_shuffle_epi32(param0_lo_val, _MM_SHUFFLE(0, 0, 0, 0)); | 
|  | __m128i vec0_lo = _mm_unpacklo_epi16(vector_0_vec, zero_vec); | 
|  | __m128i vec0_hi = _mm_unpackhi_epi16(vector_0_vec, zero_vec); | 
|  | sum_lo = _mm_add_epi32(sum_lo, _mm_mullo_epi32(vec0_lo, param0_lo_bcast)); | 
|  | sum_hi = _mm_add_epi32(sum_hi, _mm_mullo_epi32(vec0_hi, param0_lo_bcast)); | 
|  |  | 
|  | // param 1 | 
|  | __m128i param1_lo_val = read_int64(alpha_q3[1]); | 
|  | __m128i param1_lo_bcast = | 
|  | _mm_shuffle_epi32(param1_lo_val, _MM_SHUFFLE(0, 0, 0, 0)); | 
|  | __m128i vec1_lo = _mm_unpacklo_epi16(vector_1_vec, zero_vec); | 
|  | __m128i vec1_hi = _mm_unpackhi_epi16(vector_1_vec, zero_vec); | 
|  | sum_lo = _mm_add_epi32(sum_lo, _mm_mullo_epi32(vec1_lo, param1_lo_bcast)); | 
|  | sum_hi = _mm_add_epi32(sum_hi, _mm_mullo_epi32(vec1_hi, param1_lo_bcast)); | 
|  |  | 
|  | // param 2 | 
|  | __m128i param2_lo_val = read_int64(alpha_q3[2]); | 
|  | __m128i param2_lo_bcast = | 
|  | _mm_shuffle_epi32(param2_lo_val, _MM_SHUFFLE(0, 0, 0, 0)); | 
|  | __m128i vec2_lo = _mm_unpacklo_epi16(vector_2_vec, zero_vec); | 
|  | __m128i vec2_hi = _mm_unpackhi_epi16(vector_2_vec, zero_vec); | 
|  | sum_lo = _mm_add_epi32(sum_lo, _mm_mullo_epi32(vec2_lo, param2_lo_bcast)); | 
|  | sum_hi = _mm_add_epi32(sum_hi, _mm_mullo_epi32(vec2_hi, param2_lo_bcast)); | 
|  |  | 
|  | // param 3 | 
|  | __m128i param3_lo_val = read_int64(alpha_q3[3]); | 
|  | __m128i param3_lo_bcast = | 
|  | _mm_shuffle_epi32(param3_lo_val, _MM_SHUFFLE(0, 0, 0, 0)); | 
|  | __m128i vec3_lo = _mm_unpacklo_epi16(vector_3_vec, zero_vec); | 
|  | __m128i vec3_hi = _mm_unpackhi_epi16(vector_3_vec, zero_vec); | 
|  | sum_lo = _mm_add_epi32(sum_lo, _mm_mullo_epi32(vec3_lo, param3_lo_bcast)); | 
|  | sum_hi = _mm_add_epi32(sum_hi, _mm_mullo_epi32(vec3_hi, param3_lo_bcast)); | 
|  |  | 
|  | sum_lo = _mm_add_epi32(sum_lo, rounding_vec); | 
|  | sum_hi = _mm_add_epi32(sum_hi, rounding_vec); | 
|  |  | 
|  | __m128i shifted_lo = _mm_srai_epi32(sum_lo, MHCCP_DECIM_BITS); | 
|  | __m128i shifted_hi = _mm_srai_epi32(sum_hi, MHCCP_DECIM_BITS); | 
|  |  | 
|  | __m128i convolve_res = _mm_packs_epi32(shifted_lo, shifted_hi); | 
|  |  | 
|  | // --- Clip --- | 
|  | __m128i clipped_res = _mm_max_epi16(convolve_res, zero_vec); | 
|  | clipped_res = _mm_min_epi16(clipped_res, max_val_vec); | 
|  |  | 
|  | if (width < 8) { | 
|  | _mm_storeu_si64((__m128i *)¤t_dst_row[i], clipped_res); | 
|  | } else { | 
|  | _mm_storeu_si128((__m128i *)¤t_dst_row[i], clipped_res); | 
|  | } | 
|  | } | 
|  |  | 
|  | dst += dst_stride; | 
|  | input += cfl_stride; | 
|  | above_row += cfl_stride; | 
|  | } | 
|  | } | 
|  | #endif  // CONFIG_ENABLE_MHCCP |