| /* | 
 |  * Copyright (c) 2021, Alliance for Open Media. All rights reserved | 
 |  * | 
 |  * This source code is subject to the terms of the BSD 3-Clause Clear License | 
 |  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear | 
 |  * License was not distributed with this source code in the LICENSE file, you | 
 |  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the | 
 |  * Alliance for Open Media Patent License 1.0 was not distributed with this | 
 |  * source code in the PATENTS file, you can obtain it at | 
 |  * aomedia.org/license/patent-license/. | 
 |  */ | 
 |  | 
 | #include <assert.h> | 
 |  | 
 | #include "aom/aom_integer.h" | 
 |  | 
 | #include "aom_ports/mem.h" | 
 |  | 
 | #include "aom_dsp/aom_dsp_common.h" | 
 |  | 
 | #include "av1/common/reconinter.h" | 
 |  | 
/* Full-scale mask weight: the two blend weights, mask and
 * (MAX_MASK_VALUE - mask), always sum to this value. */
 | #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) | 
 |  | 
 | /** | 
 |  * Computes SSE of a compound predictor constructed from 2 fundamental | 
 |  * predictors p0 and p1 using blending with mask. | 
 |  * | 
 |  * r1:  Residuals of p1. | 
 |  *      (source - p1) | 
 |  * d:   Difference of p1 and p0. | 
 |  *      (p1 - p0) | 
 |  * m:   The blending mask | 
 |  * N:   Number of pixels | 
 |  * | 
 |  * 'r1', 'd', and 'm' are contiguous. | 
 |  * | 
 |  * Computes: | 
 |  *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to: | 
 |  *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2), | 
 |  *    where r0 is (source - p0), and r1 is (source - p1), which in turn | 
 |  *    is equivalent to: | 
 |  *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2), | 
 |  *    which is the SSE of the residuals of the compound predictor scaled up by | 
 |  *    MAX_MASK_VALUE**2. | 
 |  * | 
 |  * Note that we clamp the partial term in the loop to 16 bits signed. This is | 
 |  * to facilitate equivalent SIMD implementation. It should have no effect if | 
 |  * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always | 
 |  * holds for 8 bit input, and on real input, it should hold practically always, | 
 |  * as residuals are expected to be small. | 
 |  */ | 
 | uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, | 
 |                                         const uint8_t *m, int N) { | 
 |   uint64_t csse = 0; | 
 |   int i; | 
 |  | 
 |   for (i = 0; i < N; i++) { | 
 |     int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i]; | 
 |     t = clamp(t, INT16_MIN, INT16_MAX); | 
 |     csse += t * t; | 
 |   } | 
 |   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); | 
 | } | 
 |  | 
 | /** | 
 |  * Choose the mask sign for a compound predictor. | 
 |  * | 
 |  * ds:    Difference of the squares of the residuals. | 
 |  *        r0**2 - r1**2 | 
 |  * m:     The blending mask | 
 |  * N:     Number of pixels | 
 |  * limit: Pre-computed threshold value. | 
 |  *        MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) | 
 |  * | 
 |  * 'ds' and 'm' are contiguous. | 
 |  * | 
 |  * Returns true if the negated mask has lower SSE compared to the positive | 
 |  * mask. Computation is based on: | 
 |  *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) | 
 |  *                                     > | 
 |  *                                Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) | 
 |  * | 
 |  *  which can be simplified to: | 
 |  * | 
 |  *  Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) | 
 |  * | 
 |  *  The right hand side does not depend on the mask, and needs to be passed as | 
 |  *  the 'limit' parameter. | 
 |  * | 
 |  *  After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left | 
 |  *  hand side is simply a scalar product between an int16_t and uint8_t vector. | 
 |  * | 
 |  *  Note that for efficiency, ds is stored on 16 bits. Real input residuals | 
 |  *  being small, this should not cause a noticeable issue. | 
 |  */ | 
 | int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, | 
 |                                        int N, int64_t limit) { | 
 |   int64_t acc = 0; | 
 |  | 
 |   do { | 
 |     acc += *ds++ * *m++; | 
 |   } while (--N); | 
 |  | 
 |   return acc > limit; | 
 | } | 
 |  | 
 | /** | 
 |  * Compute the element-wise difference of the squares of 2 arrays. | 
 |  * | 
 |  * d: Difference of the squares of the inputs: a**2 - b**2 | 
 |  * a: First input array | 
 |  * b: Second input array | 
 |  * N: Number of elements | 
 |  * | 
 |  * 'd', 'a', and 'b' are contiguous. | 
 |  * | 
 |  * The result is saturated to signed 16 bits. | 
 |  */ | 
 | void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, | 
 |                                        const int16_t *b, int N) { | 
 |   int i; | 
 |  | 
 |   for (i = 0; i < N; i++) | 
 |     d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); | 
 | } |