/* Source blob: f3e24956e8cb50d6fe750c16533949d8c674f1b1 */
/*
* Copyright (c) 2022, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
// -----------------------------------------------------------------------------
// DC
// Fills the square bw x bw block at |dst| with a single value: the rounded
// mean of the |bw| samples read from |above| and the |bw| samples read from
// |left|. |stride| is the distance between consecutive rows of |dst|, counted
// in uint16_t elements.
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       const uint16_t *above,
                                       const uint16_t *left) {
  assert(bw >= 4);
  assert(IS_POWER_OF_TWO(bw));
  // Total number of edge samples that take part in the average.
  const int count = bw * 2;

  if (bw < 8) {  // 4x4
    // Widen while adding so the four pairwise sums are held in 32 bits.
    const uint32x4_t pair_sums = vaddl_u16(vld1_u16(above), vld1_u16(left));
    const uint32x2_t halves =
        vadd_u32(vget_low_u32(pair_sums), vget_high_u32(pair_sums));
    const int total = (int)vget_lane_u32(vpadd_u32(halves, halves), 0);
    const int expected_dc = (total + (count >> 1)) / count;
    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
    for (int rows_left = bw; rows_left > 0; --rows_left) {
      vst1_u16(dst, dc);
      dst += stride;
    }
    return;
  }

  // Widths of 8 and above: consume both edges eight samples at a time,
  // pairwise-accumulating into four 32-bit lanes.
  uint32x4_t acc = vdupq_n_u32(0);
  for (int x = 0; x < bw; x += 8) {
    acc = vpadalq_u16(acc, vld1q_u16(above + x));
    acc = vpadalq_u16(acc, vld1q_u16(left + x));
  }
  const uint32x2_t halves = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
  const int total = (int)vget_lane_u32(vpadd_u32(halves, halves), 0);
  // Adding half of the divisor first rounds the quotient to nearest.
  const int expected_dc = (total + (count >> 1)) / count;
  const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);

  for (int r = 0; r < bw; ++r, dst += stride) {
    for (int x = 0; x < bw; x += 8) {
      vst1q_u16(dst + x, dc);
    }
  }
}
// Emits the public square entry point
// aom_highbd_<type>_predictor_<width>x<width>_neon(), which forwards to the
// static highbd_<type>_predictor() helper with |width| as the block size. The
// bit depth argument |bd| is accepted for the common signature but not used.
#define INTRA_PRED_HIGHBD_SIZED_NEON(type, width)               \
  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    highbd_##type##_predictor(dst, stride, width, above, left); \
    (void)bd;                                                   \
  }

// Square DC predictors, 4x4 through 64x64.
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 4)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 8)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 16)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 32)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 64)
// -----------------------------------------------------------------------------
// V_PRED
// Emits aom_highbd_v_predictor_<W>x<H>_neon(), a thin wrapper that hands the
// destination, stride and above row to vertical<W>xh_neon() together with the
// row count H. The |left| and |bd| arguments are not used.
#define HIGHBD_V_NXM(W, H)                                    \
  void aom_highbd_v_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    vertical##W##xh_neon(dst, stride, above, H);              \
    (void)bd;                                                 \
    (void)left;                                               \
  }
// Reads 16 consecutive uint16_t values starting at |ptr| and returns them as
// two 8-lane vectors: val[0] holds ptr[0..7], val[1] holds ptr[8..15].
static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
  uint16x8x2_t pair;
  for (int i = 0; i < 2; ++i) {
    pair.val[i] = vld1q_u16(ptr + 8 * i);
  }
  return pair;
}
// Writes the two 8-lane vectors of |x| to 16 consecutive uint16_t slots
// starting at |ptr|: val[0] first, then val[1].
static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
  for (int i = 0; i < 2; ++i) {
    vst1q_u16(ptr + 8 * i, x.val[i]);
  }
}
// Copies the four samples at |above| into every row of a 4-wide block at
// |dst|. Two rows are written per pass and the loop ends only when the row
// counter reaches exactly zero, so |height| is expected to be a positive even
// number. |stride| is the row pitch of |dst| in uint16_t elements.
static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x4_t above_row = vld1_u16(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    vst1_u16(dst, above_row);
    vst1_u16(next_row, above_row);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
// Copies the eight samples at |above| into every row of an 8-wide block at
// |dst|. Two rows are written per pass and the loop ends only when the row
// counter reaches exactly zero, so |height| is expected to be a positive even
// number. |stride| is the row pitch of |dst| in uint16_t elements.
static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x8_t above_row = vld1q_u16(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    vst1q_u16(dst, above_row);
    vst1q_u16(next_row, above_row);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
// Copies the 16 samples at |above| into every row of a 16-wide block at |dst|.
// Two rows are written per pass and the loop ends only when the row counter
// reaches exactly zero, so |height| is expected to be a positive even number.
// |stride| is the row pitch of |dst| in uint16_t elements.
static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x2_t above_row = load_uint16x8x2(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    store_uint16x8x2(dst, above_row);
    store_uint16x8x2(next_row, above_row);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
// Reads 32 consecutive uint16_t values starting at |ptr| and returns them as
// four 8-lane vectors; val[i] holds ptr[8 * i .. 8 * i + 7].
static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
  uint16x8x4_t quad;
  for (int i = 0; i < 4; ++i) {
    quad.val[i] = vld1q_u16(ptr + 8 * i);
  }
  return quad;
}
// Writes the four 8-lane vectors of |x| to 32 consecutive uint16_t slots
// starting at |ptr|, in val[0]..val[3] order.
static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
  for (int i = 0; i < 4; ++i) {
    vst1q_u16(ptr + 8 * i, x.val[i]);
  }
}
// Copies the 32 samples at |above| into every row of a 32-wide block at |dst|.
// Two rows are written per pass and the loop ends only when the row counter
// reaches exactly zero, so |height| is expected to be a positive even number.
// |stride| is the row pitch of |dst| in uint16_t elements.
static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t above_row = load_uint16x8x4(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    store_uint16x8x4(dst, above_row);
    store_uint16x8x4(next_row, above_row);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
// Copies the 64 samples at |above| into every row of a 64-wide block at |dst|.
// The above row is held as two 32-sample halves; each pass writes both halves
// of two consecutive rows. The loop ends only when the row counter reaches
// exactly zero, so |height| is expected to be a positive even number.
// |stride| is the row pitch of |dst| in uint16_t elements.
static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t lo_half = load_uint16x8x4(above);
  const uint16x8x4_t hi_half = load_uint16x8x4(above + 32);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    store_uint16x8x4(dst, lo_half);
    store_uint16x8x4(dst + 32, hi_half);
    store_uint16x8x4(next_row, lo_half);
    store_uint16x8x4(next_row + 32, hi_half);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
// Instantiate the vertical predictors. Each line below expands HIGHBD_V_NXM
// into one aom_highbd_v_predictor_<W>x<H>_neon() definition that forwards to
// the matching vertical<W>xh_neon() helper.
// NOTE(review): unlike the PAETH instantiations in this file, none of these
// sizes is wrapped in #if !CONFIG_REALTIME_ONLY -- confirm that this matches
// the set of functions declared for realtime-only builds.
// Width 4.
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)
HIGHBD_V_NXM(4, 16)
// Width 8.
HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)
HIGHBD_V_NXM(8, 32)
// Width 16.
HIGHBD_V_NXM(16, 4)
HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)
HIGHBD_V_NXM(16, 64)
// Width 32.
HIGHBD_V_NXM(32, 8)
HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)
// Width 64.
HIGHBD_V_NXM(64, 16)
HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
// -----------------------------------------------------------------------------
// PAETH
// Paeth predictor for blocks that are 4 or 8 samples wide.
//
// For each output sample the candidate closest to
// base = top + left - top_left is chosen, preferring left, then top, then
// top_left on ties. The three distances are formed without computing base:
//   |base - left|     == |top - top_left|
//   |base - top|      == |left - top_left|
//   |base - top_left| == |top + left - 2 * top_left|
//
// |top_row| points at the first above sample; top_row[-1] is read as the
// top-left sample. |left_column| supplies one sample per output row. |stride|
// is the row pitch of |dest| in uint16_t elements. Any |width| other than 4 is
// handled as 8.
static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
                                              const uint16_t *const top_row,
                                              const uint16_t *const left_column,
                                              int width, int height) {
  const uint16_t corner = top_row[-1];
  const uint16x8_t top_left = vdupq_n_u16(corner);
  const uint16x8_t top_left_x2 = vdupq_n_u16((uint16_t)(corner + corner));
  const int is_narrow = (width == 4);

  uint16x8_t top;
  if (is_narrow) {
    // Only four samples are available; pad the upper half with zeros. Those
    // lanes are never stored.
    top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
  } else {
    top = vld1q_u16(top_row);
  }

  // Distance from base to the left sample. It depends only on the above row
  // and the corner, so it is the same for every output row.
  const uint16x8_t left_dist = vabdq_u16(top, top_left);

  for (int row = 0; row < height; ++row, dest += stride) {
    const uint16x8_t left = vdupq_n_u16(left_column[row]);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    const uint16x8_t top_left_dist =
        vabdq_u16(vaddq_u16(top, left), top_left_x2);

    // Lanes where left is at least as close as both other candidates.
    const uint16x8_t pick_left = vandq_u16(
        vcleq_u16(left_dist, top_dist), vcleq_u16(left_dist, top_left_dist));
    // Lanes where top is at least as close as top_left.
    const uint16x8_t pick_top = vcleq_u16(top_dist, top_left_dist);

    // Resolve the top / top_left choice first, then let left override it.
    const uint16x8_t top_or_corner = vbslq_u16(pick_top, top, top_left);
    const uint16x8_t pred = vbslq_u16(pick_left, left, top_or_corner);

    if (is_narrow) {
      vst1_u16(dest, vget_low_u16(pred));
    } else {
      vst1q_u16(dest, pred);
    }
  }
}
// Emits aom_highbd_paeth_predictor_<W>x<H>_neon() as a wrapper around
// highbd_paeth_4or8_x_h_neon(), passing W and H as the block dimensions.
// |bd| is not used.
#define HIGHBD_PAETH_NXM(W, H)                                  \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
    (void)bd;                                                   \
  }

// Width 4.
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
#if !CONFIG_REALTIME_ONLY
HIGHBD_PAETH_NXM(4, 16)
#endif
// Width 8.
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
#if !CONFIG_REALTIME_ONLY
HIGHBD_PAETH_NXM(8, 32)
#endif
// Builds the Paeth output vector from the three candidates and the three
// per-lane comparison masks:
//   left      where left_le_top and left_le_top_left are both set,
//   top       otherwise, where top_le_top_left is set,
//   top_left  in all remaining lanes.
static INLINE uint16x8_t select_paeth(const uint16x8_t top,
                                      const uint16x8_t left,
                                      const uint16x8_t top_left,
                                      const uint16x8_t left_le_top,
                                      const uint16x8_t left_le_top_left,
                                      const uint16x8_t top_le_top_left) {
  // if (left_dist <= top_dist && left_dist <= top_left_dist) -> left
  const uint16x8_t pick_left = vandq_u16(left_le_top, left_le_top_left);
  // Settle the lower-priority choice between top and top_left first...
  const uint16x8_t top_or_corner = vbslq_u16(top_le_top_left, top, top_left);
  // ...then let left take over every lane where it wins.
  return vbslq_u16(pick_left, left, top_or_corner);
}
// Computes one 8-lane Paeth result for vector |num| of the current row and
// stores it at dest + num * 8.
//
// The macro expands inside a scope that must already provide top[] (the above
// row as 8-lane vectors), top_left, top_left_x2, left, top_dist and dest, and
// it calls select_paeth(). |num| is parenthesized in the offset arithmetic so
// that an expression argument (for example i + 1) still yields the intended
// element offset.
#define PAETH_PREDICTOR(num)                                                  \
  do {                                                                        \
    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
    const uint16x8_t top_left_dist =                                          \
        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
    const uint16x8_t result =                                                 \
        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
                     top_le_top_left);                                        \
    vst1q_u16(dest + ((num) * 8), result);                                    \
  } while (0)
// Loads above-row vector |num| (eight samples) from the top_row pointer that
// must be in scope at the point of use.
#define LOAD_TOP_ROW(num) vld1q_u16(top_row + ((num) * 8))
// Paeth predictor for blocks that are 16, 32 or 64 samples wide.
//
// The above row is loaded once as 8-lane vectors: two for any width, four when
// |width| exceeds 16, eight when |width| is 64. Each output row then runs
// PAETH_PREDICTOR on every loaded vector. top_row[-1] is read as the top-left
// sample, |left_column| supplies one sample per output row, and |stride| is
// the row pitch of |dest| in uint16_t elements.
static INLINE void highbd_paeth16_plus_x_h_neon(
    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
    const uint16_t *const left_column, int width, int height) {
  const uint16_t corner = top_row[-1];
  const uint16x8_t top_left = vdupq_n_u16(corner);
  const uint16x8_t top_left_x2 = vdupq_n_u16((uint16_t)(corner + corner));
  // Which vector groups beyond the first 16 samples are in use.
  const int use_16_to_31 = (width > 16);
  const int use_32_to_63 = use_16_to_31 && (width == 64);

  uint16x8_t top[8];
  top[0] = LOAD_TOP_ROW(0);
  top[1] = LOAD_TOP_ROW(1);
  if (use_16_to_31) {
    top[2] = LOAD_TOP_ROW(2);
    top[3] = LOAD_TOP_ROW(3);
  }
  if (use_32_to_63) {
    top[4] = LOAD_TOP_ROW(4);
    top[5] = LOAD_TOP_ROW(5);
    top[6] = LOAD_TOP_ROW(6);
    top[7] = LOAD_TOP_ROW(7);
  }

  for (int row = 0; row < height; ++row, dest += stride) {
    // |left| and |top_dist| are shared by every vector of this row; the
    // PAETH_PREDICTOR macro picks them up by name.
    const uint16x8_t left = vdupq_n_u16(left_column[row]);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    PAETH_PREDICTOR(0);
    PAETH_PREDICTOR(1);
    if (use_16_to_31) {
      PAETH_PREDICTOR(2);
      PAETH_PREDICTOR(3);
    }
    if (use_32_to_63) {
      PAETH_PREDICTOR(4);
      PAETH_PREDICTOR(5);
      PAETH_PREDICTOR(6);
      PAETH_PREDICTOR(7);
    }
  }
}
// Emits aom_highbd_paeth_predictor_<W>x<H>_neon() as a wrapper around
// highbd_paeth16_plus_x_h_neon(), passing W and H as the block dimensions.
// |bd| is not used.
#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
      const uint16_t *left, int bd) {                             \
    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
    (void)bd;                                                     \
  }

// Width 16.
#if !CONFIG_REALTIME_ONLY
HIGHBD_PAETH_NXM_WIDE(16, 4)
#endif
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
#if !CONFIG_REALTIME_ONLY
HIGHBD_PAETH_NXM_WIDE(16, 64)
#endif
// Width 32.
#if !CONFIG_REALTIME_ONLY
HIGHBD_PAETH_NXM_WIDE(32, 8)
#endif
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
// Width 64.
#if !CONFIG_REALTIME_ONLY
HIGHBD_PAETH_NXM_WIDE(64, 16)
#endif
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)