blob: 9986666de59ab339999ce68cdfa02056067cab48 [file] [edit]
/*
* Copyright (c) 2026, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <emmintrin.h>
#include "config/av1_rtcd.h"
/*
 * Evaluates one cubic polynomial at position x for two independent 4-sample
 * inputs in a single pass: the samples of p1 travel in the low SSE lane and
 * the samples of p2 in the high lane. Per lane, with p the lane's samples:
 *
 *   p[1] + 0.5 * x * (p[2] - p[0] +
 *                     x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
 *                          x * (3.0 * (p[1] - p[2]) + p[3] - p[0])))
 *
 * p1, p2       each is read at indices 0..3.
 * x            evaluation position, shared by both lanes.
 * rate_f       receives the low-lane (p1) result.
 * distbysse_f  receives the high-lane (p2) result.
 *
 * The arithmetic is issued in the same order as the scalar C version so the
 * outputs stay bit-identical to it; do not re-associate or reorder operands.
 */
void av1_interp_cubic_rate_dist_sse2(const double *p1, const double *p2,
                                     double x, double *const rate_f,
                                     double *const distbysse_f) {
  // Lane 0 <- p1 sample, lane 1 <- p2 sample.
  const __m128d s0 = _mm_setr_pd(p1[0], p2[0]);
  const __m128d s1 = _mm_setr_pd(p1[1], p2[1]);
  const __m128d s2 = _mm_setr_pd(p1[2], p2[2]);
  const __m128d s3 = _mm_setr_pd(p1[3], p2[3]);
  const __m128d vx = _mm_set1_pd(x);

  // Innermost term: x * (3.0 * (p[1] - p[2]) + p[3] - p[0])
  const __m128d diff12 = _mm_sub_pd(s1, s2);
  const __m128d diff12_x3 = _mm_mul_pd(_mm_set1_pd(3.0), diff12);
  const __m128d inner_sum = _mm_sub_pd(_mm_add_pd(diff12_x3, s3), s0);
  const __m128d inner = _mm_mul_pd(vx, inner_sum);

  // Weighted sample sum: 2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3]
  const __m128d s0_x2 = _mm_mul_pd(_mm_set1_pd(2.0), s0);
  const __m128d s1_x5 = _mm_mul_pd(_mm_set1_pd(5.0), s1);
  const __m128d s2_x4 = _mm_mul_pd(_mm_set1_pd(4.0), s2);
  const __m128d weighted =
      _mm_sub_pd(_mm_add_pd(_mm_sub_pd(s0_x2, s1_x5), s2_x4), s3);

  // Middle term: x * (weighted + inner)
  const __m128d middle = _mm_mul_pd(vx, _mm_add_pd(weighted, inner));

  // Outer bracket: p[2] - p[0] + middle
  const __m128d outer = _mm_add_pd(_mm_sub_pd(s2, s0), middle);

  // Result: p[1] + (0.5 * x) * outer
  const __m128d half_x = _mm_mul_pd(_mm_set1_pd(0.5), vx);
  const __m128d result = _mm_add_pd(s1, _mm_mul_pd(half_x, outer));

  // Low lane belongs to p1, high lane to p2.
  _mm_storel_pd(rate_f, result);
  _mm_storeh_pd(distbysse_f, result);
}