blob: 463ee59c6f574aa08b9693a90315f97b57f0bec2 [file] [log] [blame] [edit]
/*
*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*/
#include <assert.h>
#include <arm_neon.h>
#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
// Horizontal 2-tap IntraBC convolution. Every output pixel is the rounded
// average of a source pixel and its right-hand neighbour:
//   dst[x] = (src[x] + src[x + 1] + 1) >> 1
//
// filter_params_x, subpel_x_qn and conv_params are only inspected by the
// asserts (2 taps, subpel_x_qn == 8, round_0 + round_1 == 2 * FILTER_BITS)
// and are otherwise unused.
//
// All loops are do/while, so h must be non-zero. The w <= 4 and w == 8 paths
// produce two rows per iteration and only terminate for even h. The remaining
// path walks 16 columns at a time and only terminates when w is a multiple
// of 16.
void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;

  int rows_left = h;

  if (w <= 4) {
    const int is_two_wide = (w == 2);

    do {
      const uint8_t *const top = src;
      const uint8_t *const bottom = src + src_stride;

      // Full 8-byte loads are issued even though the block is narrower; the
      // store helpers below are expected to write only 2 or 4 bytes
      // (NOTE(review): confirm against mem_neon.h).
      const uint8x8_t top_left = vld1_u8(top);
      const uint8x8_t top_right = vld1_u8(top + 1);
      const uint8x8_t bottom_left = vld1_u8(bottom);
      const uint8x8_t bottom_right = vld1_u8(bottom + 1);

      const uint8x8_t avg_top = vrhadd_u8(top_left, top_right);
      const uint8x8_t avg_bottom = vrhadd_u8(bottom_left, bottom_right);

      if (is_two_wide) {
        store_u8_2x1(dst, avg_top);
        store_u8_2x1(dst + dst_stride, avg_bottom);
      } else {
        store_u8_4x1(dst, avg_top);
        store_u8_4x1(dst + dst_stride, avg_bottom);
      }

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      rows_left -= 2;
    } while (rows_left != 0);
  } else if (w == 8) {
    do {
      const uint8_t *const top = src;
      const uint8_t *const bottom = src + src_stride;

      const uint8x8_t top_left = vld1_u8(top);
      const uint8x8_t top_right = vld1_u8(top + 1);
      const uint8x8_t bottom_left = vld1_u8(bottom);
      const uint8x8_t bottom_right = vld1_u8(bottom + 1);

      const uint8x8_t avg_top = vrhadd_u8(top_left, top_right);
      const uint8x8_t avg_bottom = vrhadd_u8(bottom_left, bottom_right);

      vst1_u8(dst, avg_top);
      vst1_u8(dst + dst_stride, avg_bottom);

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      rows_left -= 2;
    } while (rows_left != 0);
  } else {
    // One row at a time, 16 pixels per step.
    do {
      int x = 0;

      do {
        const uint8x16_t left = vld1q_u8(src + x);
        const uint8x16_t right = vld1q_u8(src + x + 1);

        vst1q_u8(dst + x, vrhaddq_u8(left, right));

        x += 16;
      } while (x != w);

      src += src_stride;
      dst += dst_stride;
    } while (--rows_left != 0);
  }
}
void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_y,
const int subpel_y_qn) {
assert(subpel_y_qn == 8);
assert(filter_params_y->taps == 2);
(void)filter_params_y;
(void)subpel_y_qn;
if (w <= 4) {
do {
uint8x8_t s0 = load_unaligned_u8_4x1(src);
uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
uint8x8_t d0 = vrhadd_u8(s0, s1);
uint8x8_t d1 = vrhadd_u8(s1, s2);
if (w == 2) {
store_u8_2x1(dst + 0 * dst_stride, d0);
store_u8_2x1(dst + 1 * dst_stride, d1);
} else {
store_u8_4x1(dst + 0 * dst_stride, d0);
store_u8_4x1(dst + 1 * dst_stride, d1);
}
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h != 0);
} else if (w == 8) {
do {
uint8x8_t s0 = vld1_u8(src);
uint8x8_t s1 = vld1_u8(src + src_stride);
uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
uint8x8_t d0 = vrhadd_u8(s0, s1);
uint8x8_t d1 = vrhadd_u8(s1, s2);
vst1_u8(dst, d0);
vst1_u8(dst + dst_stride, d1);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
} while (h != 0);
} else {
do {
const uint8_t *src_ptr = src;
uint8_t *dst_ptr = dst;
int height = h;
do {
uint8x16_t s0 = vld1q_u8(src_ptr);
uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
uint8x16_t d0 = vrhaddq_u8(s0, s1);
vst1q_u8(dst_ptr, d0);
src_ptr += src_stride;
dst_ptr += dst_stride;
} while (--height != 0);
src += 16;
dst += 16;
w -= 16;
} while (w != 0);
}
}
// 2-D (horizontal then vertical) 2-tap IntraBC convolution. Every output
// pixel is the rounded average of a 2x2 source neighbourhood:
//   dst[y][x] = (src[y][x] + src[y][x + 1] +
//                src[y + 1][x] + src[y + 1][x + 1] + 2) >> 2
//
// The work is split into two passes over a 16-bit intermediate buffer with
// row stride w:
//   1. horizontal: im[y][x] = src[y][x] + src[y][x + 1] for h + 1 rows
//      (unrounded sum),
//   2. vertical:   dst[y][x] = (im[y][x] + im[y + 1][x] + 2) >> 2, narrowed
//      back to 8 bits with saturation.
//
// The filter parameters, subpel offsets and conv_params are only inspected by
// the asserts and are otherwise unused.
//
// All loops are do/while, so w and h must be non-zero. The w <= 4 vertical
// path emits two rows per iteration and only terminates for even h; the wider
// paths step 8 columns at a time and only terminate when w is a multiple
// of 8.
void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w,
                                     int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;

  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  // ---- Pass 1: horizontal pair sums for h + 1 rows. ----
  {
    uint16_t *im_row = im_block;
    int rows_left = h + 1;

    if (w <= 4) {
      do {
        const uint8x8_t left = vld1_u8(src);
        const uint8x8_t right = vld1_u8(src + 1);
        const uint16x8_t pair_sum = vaddl_u8(left, right);

        // Four lanes are always written, even for w == 2. That is fine:
        // the intermediate buffer is large enough for the extra lanes.
        vst1_u16(im_row, vget_low_u16(pair_sum));

        src += src_stride;
        im_row += im_stride;
      } while (--rows_left != 0);
    } else {
      do {
        int x = 0;

        do {
          const uint8x8_t left = vld1_u8(src + x);
          const uint8x8_t right = vld1_u8(src + x + 1);

          vst1q_u16(im_row + x, vaddl_u8(left, right));

          x += 8;
        } while (x != w);

        src += src_stride;
        im_row += im_stride;
      } while (--rows_left != 0);
    }
  }

  // ---- Pass 2: vertical sums, rounding shift by 2, narrow to 8 bits. ----
  if (w <= 4) {
    const int is_two_wide = (w == 2);
    const uint16x4_t zero = vdup_n_u16(0);
    const uint16_t *im_row = im_block;
    int rows_left = h;

    do {
      // Three intermediate rows yield two output rows.
      const uint16x4_t row0 = vld1_u16(im_row);
      const uint16x4_t row1 = vld1_u16(im_row + im_stride);
      const uint16x4_t row2 = vld1_u16(im_row + 2 * im_stride);

      const uint16x4_t sum01 = vadd_u16(row0, row1);
      const uint16x4_t sum12 = vadd_u16(row1, row2);

      // The narrowing shift works on 8 lanes, so pad the upper half with
      // zeros; only the low lanes are stored.
      const uint8x8_t out0 = vqrshrn_n_u16(vcombine_u16(sum01, zero), 2);
      const uint8x8_t out1 = vqrshrn_n_u16(vcombine_u16(sum12, zero), 2);

      if (is_two_wide) {
        store_u8_2x1(dst, out0);
        store_u8_2x1(dst + dst_stride, out1);
      } else {
        store_u8_4x1(dst, out0);
        store_u8_4x1(dst + dst_stride, out1);
      }

      im_row += 2 * im_stride;
      dst += 2 * dst_stride;
      rows_left -= 2;
    } while (rows_left != 0);
  } else {
    // Outer loop: 8-pixel-wide column strips. Inner loop: every row of the
    // current strip, top to bottom.
    int x = 0;

    do {
      const uint16_t *im_col = im_block + x;
      uint8_t *dst_col = dst + x;
      int rows_left = h;

      do {
        const uint16x8_t upper = vld1q_u16(im_col);
        const uint16x8_t lower = vld1q_u16(im_col + im_stride);
        const uint16x8_t total = vaddq_u16(upper, lower);

        vst1_u8(dst_col, vqrshrn_n_u16(total, 2));

        im_col += im_stride;
        dst_col += dst_stride;
      } while (--rows_left != 0);

      x += 8;
    } while (x != w);
  }
}