blob: 0a12c88371bdb9ba5b673ca1df62600ac59c24ab [file] [log] [blame]
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/common.h"
#include "av1/common/arm/convolve_neon.h"
#define HORZ_FILTERING_CORE(t0, t1, t2, t3, t4, t5, t6, res) \
res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t1)); \
res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t3)); \
res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t5)); \
res3 = vreinterpretq_s16_u16(vmovl_u8(t6)); \
res = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, bd, \
conv_params->round_0);
#define PROCESS_ROW_FOR_VERTICAL_FILTER \
__builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); \
\
do { \
s7 = vld1q_s16(s); \
s += src_stride; \
\
t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, \
bd, conv_params->round_1); \
vst1_u8(d, t0); \
d += dst_stride; \
\
s0 = s1; \
s1 = s2; \
s2 = s3; \
s3 = s4; \
s4 = s5; \
s5 = s6; \
s6 = s7; \
height--; \
} while (height > 0);
static INLINE void process_row_for_horz_filtering(
uint16_t *dst_ptr, int16_t *filter_x, const uint8_t *src_ptr,
ptrdiff_t src_stride, ptrdiff_t dst_stride, int round0_bits, int w,
int height, int bd) {
do {
__builtin_prefetch(src_ptr);
uint8x8_t tt0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
__builtin_prefetch(dst_ptr);
const uint8_t *ss = src_ptr + 8;
uint16_t *d_tmp = dst_ptr;
int width = w;
do {
uint8x8_t tt7 = vld1_u8(ss); // a8 a9 a10 a11 a12 a13 a14 a15
uint8x8_t ttemp_0 = tt0;
tt0 = tt7;
uint8x8_t tt1 = vext_u8(ttemp_0, tt7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
uint8x8_t tt2 = vext_u8(ttemp_0, tt7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
uint8x8_t tt3 = vext_u8(ttemp_0, tt7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
uint8x8_t tt4 = vext_u8(ttemp_0, tt7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
uint8x8_t tt5 = vext_u8(ttemp_0, tt7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
uint8x8_t tt6 = vext_u8(ttemp_0, tt7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
tt7 = vext_u8(ttemp_0, tt7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
int16x8_t ttt0 = vreinterpretq_s16_u16(vaddl_u8(ttemp_0, tt6));
int16x8_t ttt1 = vreinterpretq_s16_u16(vaddl_u8(tt1, tt5));
int16x8_t ttt2 = vreinterpretq_s16_u16(vaddl_u8(tt2, tt4));
int16x8_t ttt3 = vreinterpretq_s16_u16(vmovl_u8(tt3));
uint16x8_t dd0 = wiener_convolve8_horiz_8x8(ttt0, ttt1, ttt2, ttt3,
filter_x, bd, round0_bits);
vst1q_u16(d_tmp, dd0);
ss += 8;
d_tmp += 8;
width -= 8;
} while (width > 0);
src_ptr += src_stride;
dst_ptr += dst_stride;
height--;
} while (height > 0);
}
/* Wiener filter 2D
Apply horizontal filter and store in a temporary buffer. When applying
vertical filter, overwrite the original pixel values.
*/
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
const ConvolveParams *conv_params) {
uint8_t *d;
const uint8_t *src_ptr, *s_tmp;
uint16_t *dst_ptr;
(void)x_step_q4;
(void)y_step_q4;
int height;
const int bd = 8;
// Indicates the height needs to be processed during horizontal filtering.
const int intermediate_height = h + SUBPEL_TAPS - 1;
const int center_tap = ((SUBPEL_TAPS - 1) / 2);
int16_t filter_x_tmp[7], filter_y_tmp[7];
DECLARE_ALIGNED(16, uint16_t,
temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w % 8));
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
assert(filter_x[7] == 0);
assert(filter_y[7] == 0);
/* assumption of horizontal filtering output will not exceed 15 bit.
((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
*/
assert((conv_params->round_0) >= 1);
memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
filter_x_tmp[3] += (1 << FILTER_BITS);
filter_y_tmp[3] += (1 << FILTER_BITS);
s_tmp = src - center_tap * src_stride - center_tap;
dst_ptr = temp;
src_ptr = s_tmp;
height = intermediate_height;
// For aarch_64.
#if defined(__aarch64__)
int processed_height = 0;
uint16_t *d_tmp;
int width, remaining_height;
// Start of horizontal filtering.
if (intermediate_height > 7) {
uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
do {
const uint8_t *s;
__builtin_prefetch(src_ptr + 0 * src_stride);
__builtin_prefetch(src_ptr + 1 * src_stride);
__builtin_prefetch(src_ptr + 2 * src_stride);
__builtin_prefetch(src_ptr + 3 * src_stride);
__builtin_prefetch(src_ptr + 4 * src_stride);
__builtin_prefetch(src_ptr + 5 * src_stride);
__builtin_prefetch(src_ptr + 6 * src_stride);
__builtin_prefetch(src_ptr + 7 * src_stride);
load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
s = src_ptr + 7;
d_tmp = dst_ptr;
width = w;
__builtin_prefetch(dst_ptr + 0 * dst_stride);
__builtin_prefetch(dst_ptr + 1 * dst_stride);
__builtin_prefetch(dst_ptr + 2 * dst_stride);
__builtin_prefetch(dst_ptr + 3 * dst_stride);
__builtin_prefetch(dst_ptr + 4 * dst_stride);
__builtin_prefetch(dst_ptr + 5 * dst_stride);
__builtin_prefetch(dst_ptr + 6 * dst_stride);
__builtin_prefetch(dst_ptr + 7 * dst_stride);
do {
int16x8_t res0, res1, res2, res3;
uint8x8_t t8, t9, t10, t11, t12, t13, t14;
load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
HORZ_FILTERING_CORE(t0, t6, t1, t5, t2, t4, t3, res4)
HORZ_FILTERING_CORE(t1, t7, t2, t6, t3, t5, t4, res5)
HORZ_FILTERING_CORE(t2, t8, t3, t7, t4, t6, t5, res6)
HORZ_FILTERING_CORE(t3, t9, t4, t8, t5, t7, t6, res7)
HORZ_FILTERING_CORE(t4, t10, t5, t9, t6, t8, t7, res8)
HORZ_FILTERING_CORE(t5, t11, t6, t10, t7, t9, t8, res9)
HORZ_FILTERING_CORE(t6, t12, t7, t11, t8, t10, t9, res10)
HORZ_FILTERING_CORE(t7, t13, t8, t12, t9, t11, t10, res11)
transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
&res11);
store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
res10, res11);
t0 = t8;
t1 = t9;
t2 = t10;
t3 = t11;
t4 = t12;
t5 = t13;
t6 = t14;
s += 8;
d_tmp += 8;
width -= 8;
} while (width > 0);
src_ptr += 8 * src_stride;
dst_ptr += 8 * MAX_SB_SIZE;
height -= 8;
processed_height += 8;
} while (height > 7);
}
// Process the remaining rows for horizontal filtering.
remaining_height = intermediate_height - processed_height;
if (remaining_height)
process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
MAX_SB_SIZE, conv_params->round_0, w, height,
bd);
// Start of vertical filtering.
{
int16_t *src_tmp_ptr, *s;
uint8_t *dst_tmp_ptr;
height = h;
width = w;
src_tmp_ptr = (int16_t *)temp;
dst_tmp_ptr = dst;
src_stride = MAX_SB_SIZE;
do {
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
uint8x8_t t0;
s = src_tmp_ptr;
s0 = vld1q_s16(s);
s += src_stride;
s1 = vld1q_s16(s);
s += src_stride;
s2 = vld1q_s16(s);
s += src_stride;
s3 = vld1q_s16(s);
s += src_stride;
s4 = vld1q_s16(s);
s += src_stride;
s5 = vld1q_s16(s);
s += src_stride;
s6 = vld1q_s16(s);
s += src_stride;
d = dst_tmp_ptr;
height = h;
do {
int16x8_t s8, s9, s10;
uint8x8_t t1, t2, t3;
__builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
__builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
s7 = vld1q_s16(s);
s += src_stride;
s8 = vld1q_s16(s);
s += src_stride;
s9 = vld1q_s16(s);
s += src_stride;
s10 = vld1q_s16(s);
s += src_stride;
t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
bd, conv_params->round_1);
t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
bd, conv_params->round_1);
t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
bd, conv_params->round_1);
t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
bd, conv_params->round_1);
vst1_u8(d, t0);
d += dst_stride;
vst1_u8(d, t1);
d += dst_stride;
vst1_u8(d, t2);
d += dst_stride;
vst1_u8(d, t3);
d += dst_stride;
s0 = s4;
s1 = s5;
s2 = s6;
s3 = s7;
s4 = s8;
s5 = s9;
s6 = s10;
height -= 4;
} while (height > 3);
if (height) {
PROCESS_ROW_FOR_VERTICAL_FILTER
}
src_tmp_ptr += 8;
dst_tmp_ptr += 8;
w -= 8;
} while (w > 0);
}
#else
// Start of horizontal filtering.
process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
MAX_SB_SIZE, conv_params->round_0, w, height,
bd);
// Start of vertical filtering.
{
int16_t *src_tmp_ptr, *s;
uint8_t *dst_tmp_ptr;
src_tmp_ptr = (int16_t *)temp;
dst_tmp_ptr = dst;
src_stride = MAX_SB_SIZE;
do {
uint8x8_t t0;
int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
s = src_tmp_ptr;
s0 = vld1q_s16(s);
s += src_stride;
s1 = vld1q_s16(s);
s += src_stride;
s2 = vld1q_s16(s);
s += src_stride;
s3 = vld1q_s16(s);
s += src_stride;
s4 = vld1q_s16(s);
s += src_stride;
s5 = vld1q_s16(s);
s += src_stride;
s6 = vld1q_s16(s);
s += src_stride;
d = dst_tmp_ptr;
height = h;
PROCESS_ROW_FOR_VERTICAL_FILTER
src_tmp_ptr += 8;
dst_tmp_ptr += 8;
w -= 8;
} while (w > 0);
}
#endif
}