|  | /* | 
|  | * Copyright (c) 2018, Alliance for Open Media. All rights reserved | 
|  | * | 
|  | * This source code is subject to the terms of the BSD 2 Clause License and | 
|  | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
|  | * was not distributed with this source code in the LICENSE file, you can | 
|  | * obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
|  | * Media Patent License 1.0 was not distributed with this source code in the | 
|  | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
|  | */ | 
|  |  | 
|  | #include <arm_neon.h> | 
|  | #include <assert.h> | 
|  |  | 
|  | #include "config/aom_config.h" | 
|  | #include "config/av1_rtcd.h" | 
|  |  | 
|  | #include "aom_dsp/txfm_common.h" | 
|  | #include "aom_dsp/arm/mem_neon.h" | 
|  | #include "aom_dsp/arm/transpose_neon.h" | 
|  | #include "aom_ports/mem.h" | 
|  | #include "av1/common/common.h" | 
|  | #include "av1/common/arm/convolve_neon.h" | 
|  |  | 
/* Produces one horizontally filtered 8-lane result in `out`.
 *
 * The three (p, q) pairs are widened to 16 bits and added together, the
 * `centre` vector is only widened, and the four sums are handed to
 * wiener_convolve8_horiz_8x8().
 *
 * The macro is not hygienic: it writes the scratch variables res0..res3 and
 * reads filter_x_tmp, bd and conv_params from the expanding scope. It expands
 * to a plain statement sequence that already ends in ';', so call sites do not
 * add one.
 */
#define HORZ_FILTERING_CORE(p0, q0, p1, q1, p2, q2, centre, out)             \
  res0 = vreinterpretq_s16_u16(vaddl_u8(p0, q0));                            \
  res1 = vreinterpretq_s16_u16(vaddl_u8(p1, q1));                            \
  res2 = vreinterpretq_s16_u16(vaddl_u8(p2, q2));                            \
  res3 = vreinterpretq_s16_u16(vmovl_u8(centre));                            \
  out = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, bd, \
                                   conv_params->round_0);
|  |  | 
/* Vertical filtering, one output row of 8 pixels per iteration.
 *
 * Each pass loads the next intermediate row into s7, filters the window
 * s0..s6, stores the 8-byte result at d, then slides the window down by one
 * row. The body always runs at least once and stops when height reaches 0.
 *
 * The macro is not hygienic: it uses s, s0..s7, t0, d, height, src_stride,
 * dst_stride, dst_tmp_ptr, filter_y_tmp, bd and conv_params from the expanding
 * scope, and it must be used without a trailing ';'.
 */
#define PROCESS_ROW_FOR_VERTICAL_FILTER                                      \
  __builtin_prefetch(dst_tmp_ptr);                                           \
                                                                             \
  do {                                                                       \
    s7 = vld1q_s16(s);                                                       \
    s += src_stride;                                                         \
                                                                             \
    t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, \
                                   bd, conv_params->round_1);                \
    vst1_u8(d, t0);                                                          \
    d += dst_stride;                                                         \
                                                                             \
    s0 = s1;                                                                 \
    s1 = s2;                                                                 \
    s2 = s3;                                                                 \
    s3 = s4;                                                                 \
    s4 = s5;                                                                 \
    s5 = s6;                                                                 \
    s6 = s7;                                                                 \
  } while (--height > 0);
|  |  | 
/* Horizontal filtering, one row at a time, 8 output values per step.
 *
 * dst_ptr      destination for the 16-bit intermediate results
 * filter_x     taps forwarded unchanged to wiener_convolve8_horiz_8x8()
 * src_ptr      first source byte of the first row to filter
 * src_stride   distance between source rows, in bytes
 * dst_stride   distance between destination rows, in uint16_t elements
 * round0_bits  rounding amount forwarded to the convolve helper
 * w            number of columns; consumed in steps of 8
 * height       number of rows; at least one row is always processed
 * bd           bit depth forwarded to the convolve helper
 *
 * For every group of 8 outputs the code reads 16 consecutive source bytes.
 */
static INLINE void process_row_for_horz_filtering(
    uint16_t *dst_ptr, int16_t *filter_x, const uint8_t *src_ptr,
    ptrdiff_t src_stride, ptrdiff_t dst_stride, int round0_bits, int w,
    int height, int bd) {
  do {
    __builtin_prefetch(src_ptr);

    // Low half of the 16-byte window: a0 .. a7.
    uint8x8_t lo = vld1_u8(src_ptr);

    __builtin_prefetch(dst_ptr);

    const uint8_t *in = src_ptr + 8;
    uint16_t *out = dst_ptr;
    int cols_left = w;

    do {
      // High half of the window: a8 .. a15.
      const uint8x8_t hi = vld1_u8(in);

      // Pair the samples at offsets k and 6 - k around the centre offset 3,
      // widening to 16 bits while adding.
      const int16x8_t sum_0_6 =
          vreinterpretq_s16_u16(vaddl_u8(lo, vext_u8(lo, hi, 6)));
      const int16x8_t sum_1_5 = vreinterpretq_s16_u16(
          vaddl_u8(vext_u8(lo, hi, 1), vext_u8(lo, hi, 5)));
      const int16x8_t sum_2_4 = vreinterpretq_s16_u16(
          vaddl_u8(vext_u8(lo, hi, 2), vext_u8(lo, hi, 4)));
      const int16x8_t centre =
          vreinterpretq_s16_u16(vmovl_u8(vext_u8(lo, hi, 3)));

      const uint16x8_t filtered = wiener_convolve8_horiz_8x8(
          sum_0_6, sum_1_5, sum_2_4, centre, filter_x, bd, round0_bits);
      vst1q_u16(out, filtered);

      // The high half becomes the low half of the next window.
      lo = hi;
      in += 8;
      out += 8;
      cols_left -= 8;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height > 0);
}
|  |  | 
/* Wiener filter 2D (8-bit).
 *
 * Two passes: the horizontal filter writes 16-bit results into a stack
 * buffer (`temp`, row pitch MAX_SB_SIZE); the vertical filter then reads that
 * buffer and writes the final 8-bit pixels to `dst`.
 *
 * src, src_stride   source pixels and row pitch; reading starts center_tap
 *                   rows above and center_tap columns left of `src`
 * dst, dst_stride   output pixels and row pitch
 * filter_x/filter_y tap arrays; element [7] must be 0 (asserted), elements
 *                   [0..6] are used
 * x_step_q4/y_step_q4  must both be 16 (asserted); otherwise unused
 * w                 width, a multiple of 8 and <= MAX_SB_SIZE (asserted)
 * h                 height, <= MAX_SB_SIZE (asserted); any h >= 1 is handled
 * conv_params       round_0 (>= 1, asserted) is used by the horizontal pass,
 *                   round_1 by the vertical pass
 */
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h,
                                      const ConvolveParams *conv_params) {
  uint8_t *d;
  const uint8_t *src_ptr;
  uint16_t *dst_ptr;
  (void)x_step_q4;
  (void)y_step_q4;

  int height;
  const int bd = 8;
  // Indicates the height needs to be processed during horizontal filtering.
  const int intermediate_height = h + SUBPEL_TAPS - 1;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  int16_t filter_x_tmp[7], filter_y_tmp[7];

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);

  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w % 8));

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(filter_x[7] == 0);
  assert(filter_y[7] == 0);

  /* assumption of horizontal filtering output will not exceed 15 bit.
     ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
     16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
   */
  assert((conv_params->round_0) >= 1);

  // Copy the 7 taps that are actually used. The length is taken from the
  // destination arrays rather than from FILTER_BITS, which is a shift amount
  // that only happens to equal the tap count.
  memcpy(filter_x_tmp, filter_x, sizeof(filter_x_tmp));
  memcpy(filter_y_tmp, filter_y, sizeof(filter_y_tmp));

  // NOTE(review): adding 1 << FILTER_BITS to the centre tap presumably
  // implements the "add_src" part of the function name - confirm against the
  // C reference implementation.
  filter_x_tmp[3] += (1 << FILTER_BITS);
  filter_y_tmp[3] += (1 << FILTER_BITS);

  src_ptr = src - center_tap * src_stride - center_tap;
  dst_ptr = temp;
  height = intermediate_height;

  // For aarch_64.
#if defined(__aarch64__)
  // Start of horizontal filtering: 8 rows x 8 columns per step.
  if (intermediate_height > 7) {
    uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    do {
      const uint8_t *s;
      uint16_t *d_tmp;
      int width;

      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      s = src_ptr + 7;
      d_tmp = dst_ptr;
      width = w;

      // dst_ptr walks `temp`, whose rows are MAX_SB_SIZE elements apart.
      __builtin_prefetch(dst_ptr + 0 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 1 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 2 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 3 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 4 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 5 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 6 * MAX_SB_SIZE);
      __builtin_prefetch(dst_ptr + 7 * MAX_SB_SIZE);

      do {
        int16x8_t res0, res1, res2, res3;
        uint8x8_t t8, t9, t10, t11, t12, t13, t14;
        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);

        // One call per output column; arguments are the three outer pairs
        // followed by the centre vector of a 7-wide window.
        HORZ_FILTERING_CORE(t0, t6, t1, t5, t2, t4, t3, res4)
        HORZ_FILTERING_CORE(t1, t7, t2, t6, t3, t5, t4, res5)
        HORZ_FILTERING_CORE(t2, t8, t3, t7, t4, t6, t5, res6)
        HORZ_FILTERING_CORE(t3, t9, t4, t8, t5, t7, t6, res7)
        HORZ_FILTERING_CORE(t4, t10, t5, t9, t6, t8, t7, res8)
        HORZ_FILTERING_CORE(t5, t11, t6, t10, t7, t9, t8, res9)
        HORZ_FILTERING_CORE(t6, t12, t7, t11, t8, t10, t9, res10)
        HORZ_FILTERING_CORE(t7, t13, t8, t12, t9, t11, t10, res11)

        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
                          &res11);
        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
                      res10, res11);

        // Carry the last seven vectors over as the start of the next window.
        t0 = t8;
        t1 = t9;
        t2 = t10;
        t3 = t11;
        t4 = t12;
        t5 = t13;
        t6 = t14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * MAX_SB_SIZE;
      height -= 8;
    } while (height > 7);
  }

  // Process the remaining (fewer than 8) rows for horizontal filtering.
  if (height > 0) {
    process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
                                   MAX_SB_SIZE, conv_params->round_0, w, height,
                                   bd);
  }

  // Start of vertical filtering.
  {
    int16_t *src_tmp_ptr, *s;
    uint8_t *dst_tmp_ptr;
    src_tmp_ptr = (int16_t *)temp;
    dst_tmp_ptr = dst;
    // From here on src_stride is the row pitch of `temp`; the single-row
    // macro below relies on that name.
    src_stride = MAX_SB_SIZE;

    do {
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      uint8x8_t t0;
      s = src_tmp_ptr;
      s0 = vld1q_s16(s);
      s += src_stride;
      s1 = vld1q_s16(s);
      s += src_stride;
      s2 = vld1q_s16(s);
      s += src_stride;
      s3 = vld1q_s16(s);
      s += src_stride;
      s4 = vld1q_s16(s);
      s += src_stride;
      s5 = vld1q_s16(s);
      s += src_stride;
      s6 = vld1q_s16(s);
      s += src_stride;
      d = dst_tmp_ptr;
      height = h;

      // Four output rows per step. This is a pre-tested loop so that h < 4
      // never stores rows past h or reads rows of `temp` that the horizontal
      // pass did not produce.
      while (height > 3) {
        int16x8_t s8, s9, s10;
        uint8x8_t t1, t2, t3;
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);

        s7 = vld1q_s16(s);
        s += src_stride;
        s8 = vld1q_s16(s);
        s += src_stride;
        s9 = vld1q_s16(s);
        s += src_stride;
        s10 = vld1q_s16(s);
        s += src_stride;

        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
                                       bd, conv_params->round_1);
        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
                                       bd, conv_params->round_1);
        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
                                       bd, conv_params->round_1);
        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
                                       bd, conv_params->round_1);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      }

      // Remaining 1..3 rows, one at a time.
      if (height > 0) {
        PROCESS_ROW_FOR_VERTICAL_FILTER
      }
      src_tmp_ptr += 8;
      dst_tmp_ptr += 8;
      w -= 8;
    } while (w > 0);
  }
#else
  // Start of horizontal filtering.
  process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
                                 MAX_SB_SIZE, conv_params->round_0, w, height,
                                 bd);

  // Start of vertical filtering.
  {
    int16_t *src_tmp_ptr, *s;
    uint8_t *dst_tmp_ptr;
    src_tmp_ptr = (int16_t *)temp;
    dst_tmp_ptr = dst;
    // From here on src_stride is the row pitch of `temp`; the single-row
    // macro below relies on that name.
    src_stride = MAX_SB_SIZE;

    do {
      uint8x8_t t0;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      s = src_tmp_ptr;
      s0 = vld1q_s16(s);
      s += src_stride;
      s1 = vld1q_s16(s);
      s += src_stride;
      s2 = vld1q_s16(s);
      s += src_stride;
      s3 = vld1q_s16(s);
      s += src_stride;
      s4 = vld1q_s16(s);
      s += src_stride;
      s5 = vld1q_s16(s);
      s += src_stride;
      s6 = vld1q_s16(s);
      s += src_stride;
      d = dst_tmp_ptr;
      height = h;
      PROCESS_ROW_FOR_VERTICAL_FILTER

      src_tmp_ptr += 8;
      dst_tmp_ptr += 8;

      w -= 8;
    } while (w > 0);
  }
#endif
}