/* blob: a00d8211bfc0e93de4900e7331eada8d5332281d [file] [log] [blame] */
/*
* Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <stddef.h>
#include <stdlib.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
/* clang-format off */
// Error metric used for global motion evaluation.
//
// Lookup table with 256 entries, indexed by the absolute difference between
// two 8-bit samples (0..255). av1_calc_frame_error_neon() adds one entry per
// pixel to its running total.
//
// Entries rise monotonically from 0 (identical samples) to 16384 (1 << 14,
// for a difference of 255), and the step between neighbouring entries shrinks
// as the difference grows (339 for the first step, 45 for the last), so large
// differences are weighted sub-linearly.
//
// NOTE(review): the values look like round(16384 * (i / 255)^0.7) -- confirm
// against the table's generator before regenerating or extending it.
static const uint16_t error_measure_lut[256] = {
0, 339, 550, 731, 894, 1045, 1187, 1323,
1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255,
2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041,
3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748,
3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401,
4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015,
5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599,
5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157,
6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695,
6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214,
7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718,
7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208,
8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686,
8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153,
9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610,
9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058,
10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
};
/* clang-format on */
// Sums the error_measure_lut[] cost of every pixel pair in two 8-bit planes.
//
// For each of the `height` rows, the absolute difference |dst - ref| of each
// of the `width` pixels is used as an index into error_measure_lut[], and all
// looked-up values are accumulated.
//
//   ref, ref_stride  First plane and the distance (in bytes) between the
//                    starts of consecutive rows.
//   dst, dst_stride  Second plane and its row distance.
//   width, height    Size of the compared region in pixels.
//
// Returns the accumulated error, or 0 when the region is empty
// (width <= 0 or height <= 0).
//
// Rows are processed 16 pixels at a time with NEON; any remaining
// (width % 16) pixels are handled one by one. Widths below 16 therefore use
// only the scalar path and never load past the end of a row.
int64_t av1_calc_frame_error_neon(const uint8_t *const ref, int ref_stride,
                                  const uint8_t *const dst, int dst_stride,
                                  int width, int height) {
  // An empty region contributes no error. This also keeps the loops below
  // from ever running with a non-positive trip count.
  if (width <= 0 || height <= 0) return 0;

  // Four independent accumulators; presumably this shortens the dependency
  // chain of the scalar adds. They are folded together on return.
  int64_t sum_error[4] = { 0, 0, 0, 0 };

  for (int row = 0; row < height; ++row) {
    // Compute each row start in ptrdiff_t so that row * stride cannot
    // overflow `int` on very large planes, and so that no pointer is formed
    // beyond the last row.
    const uint8_t *ref_ptr = ref + (ptrdiff_t)row * ref_stride;
    const uint8_t *dst_ptr = dst + (ptrdiff_t)row * dst_stride;
    int w = width;

    // Vector part: only entered while a full 16-pixel group remains.
    while (w >= 16) {
      const uint8x16_t dst_v = vld1q_u8(dst_ptr);
      const uint8x16_t ref_v = vld1q_u8(ref_ptr);
#if AOM_ARCH_AARCH64
      // Move the 16 absolute differences to general registers as two 64-bit
      // lanes, then peel off one byte per table lookup.
      const uint64x2_t abs_v = vreinterpretq_u64_u8(vabdq_u8(dst_v, ref_v));
      const uint64_t abs0 = vgetq_lane_u64(abs_v, 0);
      const uint64_t abs1 = vgetq_lane_u64(abs_v, 1);
      sum_error[0] += error_measure_lut[(abs0 >> 0) & 0xFF];
      sum_error[1] += error_measure_lut[(abs0 >> 8) & 0xFF];
      sum_error[2] += error_measure_lut[(abs0 >> 16) & 0xFF];
      sum_error[3] += error_measure_lut[(abs0 >> 24) & 0xFF];
      sum_error[0] += error_measure_lut[(abs0 >> 32) & 0xFF];
      sum_error[1] += error_measure_lut[(abs0 >> 40) & 0xFF];
      sum_error[2] += error_measure_lut[(abs0 >> 48) & 0xFF];
      sum_error[3] += error_measure_lut[(abs0 >> 56) & 0xFF];
      sum_error[0] += error_measure_lut[(abs1 >> 0) & 0xFF];
      sum_error[1] += error_measure_lut[(abs1 >> 8) & 0xFF];
      sum_error[2] += error_measure_lut[(abs1 >> 16) & 0xFF];
      sum_error[3] += error_measure_lut[(abs1 >> 24) & 0xFF];
      sum_error[0] += error_measure_lut[(abs1 >> 32) & 0xFF];
      sum_error[1] += error_measure_lut[(abs1 >> 40) & 0xFF];
      sum_error[2] += error_measure_lut[(abs1 >> 48) & 0xFF];
      sum_error[3] += error_measure_lut[(abs1 >> 56) & 0xFF];
#else   // !AOM_ARCH_AARCH64
      // Same idea with four 32-bit lanes when AOM_ARCH_AARCH64 is not set.
      const uint32x4_t abs_v = vreinterpretq_u32_u8(vabdq_u8(dst_v, ref_v));
      const uint32_t abs0 = vgetq_lane_u32(abs_v, 0);
      const uint32_t abs1 = vgetq_lane_u32(abs_v, 1);
      const uint32_t abs2 = vgetq_lane_u32(abs_v, 2);
      const uint32_t abs3 = vgetq_lane_u32(abs_v, 3);
      sum_error[0] += error_measure_lut[(abs0 >> 0) & 0xFF];
      sum_error[1] += error_measure_lut[(abs0 >> 8) & 0xFF];
      sum_error[2] += error_measure_lut[(abs0 >> 16) & 0xFF];
      sum_error[3] += error_measure_lut[(abs0 >> 24) & 0xFF];
      sum_error[0] += error_measure_lut[(abs1 >> 0) & 0xFF];
      sum_error[1] += error_measure_lut[(abs1 >> 8) & 0xFF];
      sum_error[2] += error_measure_lut[(abs1 >> 16) & 0xFF];
      sum_error[3] += error_measure_lut[(abs1 >> 24) & 0xFF];
      sum_error[0] += error_measure_lut[(abs2 >> 0) & 0xFF];
      sum_error[1] += error_measure_lut[(abs2 >> 8) & 0xFF];
      sum_error[2] += error_measure_lut[(abs2 >> 16) & 0xFF];
      sum_error[3] += error_measure_lut[(abs2 >> 24) & 0xFF];
      sum_error[0] += error_measure_lut[(abs3 >> 0) & 0xFF];
      sum_error[1] += error_measure_lut[(abs3 >> 8) & 0xFF];
      sum_error[2] += error_measure_lut[(abs3 >> 16) & 0xFF];
      sum_error[3] += error_measure_lut[(abs3 >> 24) & 0xFF];
#endif  // AOM_ARCH_AARCH64
      dst_ptr += 16;
      ref_ptr += 16;
      w -= 16;
    }

    // Scalar tail: the remaining 0..15 pixels of the row.
    while (w > 0) {
      sum_error[0] += error_measure_lut[abs(*dst_ptr - *ref_ptr)];
      ++dst_ptr;
      ++ref_ptr;
      --w;
    }
  }

  return sum_error[0] + sum_error[1] + sum_error[2] + sum_error[3];
}