/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
| |
#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
| |
// Largest edge length accepted by av1_upsample_intra_edge_neon(); it also
// sizes that function's temporary edge buffer.
#define MAX_UPSAMPLE_SZ 16
// Rounding right shift applied to the filter-intra accumulator before the
// result is narrowed to 8 bits.
#define FILTER_INTRA_SCALE_BITS 4
| |
| // These kernels are a transposed version of those defined in reconintra.c, |
| // with the absolute value of the negatives taken in the top row. |
| DECLARE_ALIGNED(16, static const uint8_t, |
| av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = { |
| // clang-format off |
| { |
| { 6, 5, 3, 3, 4, 3, 3, 3 }, |
| { 10, 2, 1, 1, 6, 2, 2, 1 }, |
| { 0, 10, 1, 1, 0, 6, 2, 2 }, |
| { 0, 0, 10, 2, 0, 0, 6, 2 }, |
| { 0, 0, 0, 10, 0, 0, 0, 6 }, |
| { 12, 9, 7, 5, 2, 2, 2, 3 }, |
| { 0, 0, 0, 0, 12, 9, 7, 5 } |
| }, |
| { |
| { 10, 6, 4, 2, 10, 6, 4, 2 }, |
| { 16, 0, 0, 0, 16, 0, 0, 0 }, |
| { 0, 16, 0, 0, 0, 16, 0, 0 }, |
| { 0, 0, 16, 0, 0, 0, 16, 0 }, |
| { 0, 0, 0, 16, 0, 0, 0, 16 }, |
| { 10, 6, 4, 2, 0, 0, 0, 0 }, |
| { 0, 0, 0, 0, 10, 6, 4, 2 } |
| }, |
| { |
| { 8, 8, 8, 8, 4, 4, 4, 4 }, |
| { 8, 0, 0, 0, 4, 0, 0, 0 }, |
| { 0, 8, 0, 0, 0, 4, 0, 0 }, |
| { 0, 0, 8, 0, 0, 0, 4, 0 }, |
| { 0, 0, 0, 8, 0, 0, 0, 4 }, |
| { 16, 16, 16, 16, 0, 0, 0, 0 }, |
| { 0, 0, 0, 0, 16, 16, 16, 16 } |
| }, |
| { |
| { 2, 1, 1, 0, 1, 1, 1, 1 }, |
| { 8, 3, 2, 1, 4, 3, 2, 2 }, |
| { 0, 8, 3, 2, 0, 4, 3, 2 }, |
| { 0, 0, 8, 3, 0, 0, 4, 3 }, |
| { 0, 0, 0, 8, 0, 0, 0, 4 }, |
| { 10, 6, 4, 2, 3, 4, 4, 3 }, |
| { 0, 0, 0, 0, 10, 6, 4, 3 } |
| }, |
| { |
| { 12, 10, 9, 8, 10, 9, 8, 7 }, |
| { 14, 0, 0, 0, 12, 1, 0, 0 }, |
| { 0, 14, 0, 0, 0, 12, 0, 0 }, |
| { 0, 0, 14, 0, 0, 0, 12, 1 }, |
| { 0, 0, 0, 14, 0, 0, 0, 12 }, |
| { 14, 12, 11, 10, 0, 0, 1, 1 }, |
| { 0, 0, 0, 0, 14, 12, 11, 9 } |
| } |
| // clang-format on |
| }; |
| |
// Applies the 7-tap filter-intra kernel to eight lanes at once.
//
// s0..s6 are the neighbour pixels (each broadcast by the caller) and f0..f6
// the matching tap vectors. The s0 * f0 product is subtracted because the
// first tap row holds magnitudes of negative taps; all other products are
// added. The result is rounded, shifted right by FILTER_INTRA_SCALE_BITS and
// saturated to the unsigned 8-bit range.
static inline uint8x8_t filter_intra_predictor(
    uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4,
    uint8x8_t s5, uint8x8_t s6, const uint8x8_t f0, const uint8x8_t f1,
    const uint8x8_t f2, const uint8x8_t f3, const uint8x8_t f4,
    const uint8x8_t f5, const uint8x8_t f6) {
  // Accumulate in unsigned 16-bit lanes. The arithmetic is modulo 2^16, so
  // the order in which the products are combined does not affect the result.
  uint16x8_t sum = vmlal_u8(vmull_u8(s1, f1), s2, f2);
  sum = vmlal_u8(sum, s3, f3);
  sum = vmlal_u8(sum, s4, f4);
  sum = vmlal_u8(sum, s5, f5);
  sum = vmlal_u8(sum, s6, f6);
  sum = vmlsl_u8(sum, s0, f0);

  // The subtraction may take the sum below zero, so view it as signed and
  // let the saturating narrowing shift clamp it into [0, 255].
  const int16x8_t signed_sum = vreinterpretq_s16_u16(sum);
  return vqrshrun_n_s16(signed_sum, FILTER_INTRA_SCALE_BITS);
}
| |
// Filters one 4x2 unit whose above neighbours are lanes 0-3 of |top|.
// |top_left|, |left0| and |left1| are the above-left pixel and the pixels
// left of the unit's first and second row, each broadcast to all lanes.
// |taps| points to the seven tap vectors of the selected mode. The first
// output row is returned in lanes 0-3 and the second in lanes 4-7.
static inline uint8x8_t filter_intra_4x2_lo(uint8x8_t top_left, uint8x8_t top,
                                            uint8x8_t left0, uint8x8_t left1,
                                            const uint8x8_t *taps) {
  return filter_intra_predictor(
      top_left, vdup_lane_u8(top, 0), vdup_lane_u8(top, 1),
      vdup_lane_u8(top, 2), vdup_lane_u8(top, 3), left0, left1, taps[0],
      taps[1], taps[2], taps[3], taps[4], taps[5], taps[6]);
}

// Same as filter_intra_4x2_lo(), but the above neighbours are lanes 4-7 of
// |top|.
static inline uint8x8_t filter_intra_4x2_hi(uint8x8_t top_left, uint8x8_t top,
                                            uint8x8_t left0, uint8x8_t left1,
                                            const uint8x8_t *taps) {
  return filter_intra_predictor(
      top_left, vdup_lane_u8(top, 4), vdup_lane_u8(top, 5),
      vdup_lane_u8(top, 6), vdup_lane_u8(top, 7), left0, left1, taps[0],
      taps[1], taps[2], taps[3], taps[4], taps[5], taps[6]);
}

// Predicts two full output rows, eight columns per iteration, writing them to
// |out_row| and |out_row| + |stride|. |top_row| is the row directly above the
// pair. |top_left|, |left0| and |left1| are the broadcast neighbours of the
// leftmost 4x2 unit; every later unit takes them from the unit to its left.
static inline void filter_intra_row_pair_w8(uint8_t *out_row, ptrdiff_t stride,
                                            const uint8_t *top_row,
                                            uint8x8_t top_left,
                                            uint8x8_t left0, uint8x8_t left1,
                                            int width, const uint8x8_t *taps) {
  int col = 0;
  do {
    const uint8x8_t top = vld1_u8(top_row + col);

    const uint8x8_t out_lo =
        filter_intra_4x2_lo(top_left, top, left0, left1, taps);
    // The right-hand unit depends on the rightmost column of the left one.
    const uint8x8_t out_hi =
        filter_intra_4x2_hi(vdup_lane_u8(top, 3), top, vdup_lane_u8(out_lo, 3),
                            vdup_lane_u8(out_lo, 7), taps);

    // Each result holds [row0 | row1] for four columns; zip the 32-bit halves
    // so that val[0] is eight pixels of row 0 and val[1] eight of row 1.
    const uint32x2x2_t rows =
        vzip_u32(vreinterpret_u32_u8(out_lo), vreinterpret_u32_u8(out_hi));
    vst1_u8(out_row + col, vreinterpret_u8_u32(rows.val[0]));
    vst1_u8(out_row + stride + col, vreinterpret_u8_u32(rows.val[1]));

    top_left = vdup_lane_u8(top, 7);
    left0 = vdup_lane_u8(out_hi, 3);
    left1 = vdup_lane_u8(out_hi, 7);
    col += 8;
  } while (col < width);
}

// Filter-intra prediction of a whole transform block.
//
// The block dimensions come from tx_size_wide[] / tx_size_high[] and are
// asserted to be at most 32. |mode| selects the tap set in
// av1_filter_intra_taps_neon. The block is produced in 4x2 units, two rows at
// a time: the first row pair is predicted from |above| (including above[-1])
// and |left|; every later pair uses the previously written row of |dst| as
// its above neighbours and |left| for the leftmost unit.
void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
                                     TX_SIZE tx_size, const uint8_t *above,
                                     const uint8_t *left, int mode) {
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  assert(width <= 32 && height <= 32);

  uint8x8_t taps[7];
  for (int i = 0; i < 7; ++i) {
    taps[i] = vld1_u8(av1_filter_intra_taps_neon[mode][i]);
  }

  if (width <= 8) {
    // For 8-wide blocks, handling four columns per step (rather than eight)
    // is the faster option, so widths 4 and 8 share this path.
    uint8x8_t top_left = vdup_n_u8(above[-1]);
    uint8x8_t left0 = vdup_n_u8(left[0]);
    uint8x8_t left1 = vdup_n_u8(left[1]);

    // Rows 0 and 1: above neighbours come from |above|.
    int col = 0;
    do {
      const uint8x8_t top = load_unaligned_u8_4x1(above + col);
      const uint8x8_t out =
          filter_intra_4x2_lo(top_left, top, left0, left1, taps);

      // Expected to write lanes 0-3 and lanes 4-7 to two rows |stride| apart.
      store_u8x4_strided_x2(dst + col, stride, out);

      top_left = vdup_lane_u8(top, 3);
      left0 = vdup_lane_u8(out, 3);
      left1 = vdup_lane_u8(out, 7);
      col += 4;
    } while (col < width);

    // Remaining row pairs: above neighbours are the row just written.
    int row = 2;
    do {
      uint8_t *const out_row = dst + row * stride;
      const uint8_t *const top_row = dst + (row - 1) * stride;

      top_left = vdup_n_u8(left[row - 1]);
      left0 = vdup_n_u8(left[row]);
      left1 = vdup_n_u8(left[row + 1]);

      col = 0;
      do {
        const uint8x8_t top = load_u8_4x1(top_row + col);
        const uint8x8_t out =
            filter_intra_4x2_lo(top_left, top, left0, left1, taps);

        store_u8x4_strided_x2(out_row + col, stride, out);

        top_left = vdup_lane_u8(top, 3);
        left0 = vdup_lane_u8(out, 3);
        left1 = vdup_lane_u8(out, 7);
        col += 4;
      } while (col < width);

      row += 2;
    } while (row < height);
  } else {
    // Rows 0 and 1: above neighbours come from |above|.
    filter_intra_row_pair_w8(dst, stride, above, vdup_n_u8(above[-1]),
                             vdup_n_u8(left[0]), vdup_n_u8(left[1]), width,
                             taps);

    // Remaining row pairs: above neighbours are the row just written.
    int row = 2;
    do {
      filter_intra_row_pair_w8(dst + row * stride, stride,
                               dst + (row - 1) * stride,
                               vdup_n_u8(left[row - 1]), vdup_n_u8(left[row]),
                               vdup_n_u8(left[row + 1]), width, taps);
      row += 2;
    } while (row < height);
  }
}
| |
// Filters eight consecutive pixels with the 3-tap kernel {4, 8, 4} / 16.
// Output lane i is centred on src[i + 1]; src[0..9] are read.
static inline uint8x8_t edge_kernel_4_8_4(const uint8_t *src) {
  const uint8x8_t prev = vld1_u8(src);
  const uint8x8_t curr = vld1_u8(src + 1);
  const uint8x8_t next = vld1_u8(src + 2);

  // (4*a + 8*b + 4*c + 8) >> 4 == (a + 2*b + c + 2) >> 2
  const uint16x8_t sum = vaddq_u16(vaddl_u8(prev, next), vaddl_u8(curr, curr));
  return vrshrn_n_u16(sum, 2);
}

// Filters eight consecutive pixels with the 3-tap kernel {5, 6, 5} / 16.
// Output lane i is centred on src[i + 1]; src[0..9] are read.
static inline uint8x8_t edge_kernel_5_6_5(const uint8_t *src) {
  const uint8x8_t outer_tap = vdup_n_u8(5);
  const uint8x8_t centre_tap = vdup_n_u8(6);

  uint16x8_t sum = vmull_u8(vld1_u8(src), outer_tap);
  sum = vmlal_u8(sum, vld1_u8(src + 1), centre_tap);
  sum = vmlal_u8(sum, vld1_u8(src + 2), outer_tap);
  return vrshrn_n_u16(sum, 4);
}

// Filters eight consecutive pixels with the 5-tap kernel {2, 4, 4, 4, 2} / 16.
// Output lane i is centred on src[i + 2]; src[0..11] are read.
static inline uint8x8_t edge_kernel_2_4_4_4_2(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);
  const uint8x8_t s3 = vld1_u8(src + 3);
  const uint8x8_t s4 = vld1_u8(src + 4);

  // (2*a + 4*b + 4*c + 4*d + 2*e + 8) >> 4
  //     == (a + 2*(b + c + d) + e + 4) >> 3
  const uint16x8_t outer = vaddl_u8(s0, s4);
  uint16x8_t inner = vaddw_u8(vaddl_u8(s1, s2), s3);
  inner = vaddq_u16(inner, inner);
  return vrshrn_n_u16(vaddq_u16(outer, inner), 3);
}

// Replaces only the first |count| bytes at |dst| with the matching lanes of
// |vals|. All eight bytes at |dst| are loaded and stored back, so they must be
// addressable; bytes at index >= |count| keep the value that was loaded.
static inline void edge_store_first_n(uint8_t *dst, uint8x8_t vals, int count) {
  const uint8x8_t lane_index = vcreate_u8(0x0706050403020100);
  const uint8x8_t in_range = vcgt_u8(vdup_n_u8((uint8_t)count), lane_index);
  vst1_u8(dst, vbsl_u8(in_range, vals, vld1_u8(dst)));
}

// Smooths the intra edge p[0..sz-1] in place.
//
// |strength| selects the kernel: 0 leaves the edge untouched, 1 uses
// {4, 8, 4}, 2 uses {5, 6, 5} and any other value uses {2, 4, 4, 4, 2} (all
// divided by 16 with rounding). p[0] is never modified; p[1..sz-1] are
// replaced by their filtered values, with samples beyond either end of the
// edge taken as copies of the nearest end pixel. |sz| must be in [0, 129].
//
// When the number of filtered pixels is not a multiple of 8, the final store
// reads eight bytes at the tail position and writes them back with only the
// in-range lanes changed, so up to seven bytes past p[sz - 1] must be
// addressable.
void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
  if (!strength) return;
  assert(sz >= 0 && sz <= 129);

  // p[0] is preserved, so an edge shorter than two pixels has nothing to
  // filter. Returning here also avoids padding from an empty copy.
  if (sz < 2) return;

  // edge[1 + i] mirrors p[i]. One pixel of left padding and two of right
  // padding cover the widest kernel; the rest of the array is head-room for
  // whole-vector loads (values read there only reach masked-off lanes).
  uint8_t edge[160];
  memcpy(edge + 1, p, sz * sizeof(*p));
  edge[0] = edge[1];
  edge[sz + 1] = edge[sz];
  edge[sz + 2] = edge[sz];

  // Don't overwrite the first pixel.
  uint8_t *dst = p + 1;
  int remaining = sz - 1;

  if (strength == 1) {
    const uint8_t *src = edge + 1;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1_u8(dst, edge_kernel_4_8_4(src));
    }
    if (remaining > 0) {
      edge_store_first_n(dst, edge_kernel_4_8_4(src), remaining);
    }
  } else if (strength == 2) {
    const uint8_t *src = edge + 1;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1_u8(dst, edge_kernel_5_6_5(src));
    }
    if (remaining > 0) {
      edge_store_first_n(dst, edge_kernel_5_6_5(src), remaining);
    }
  } else {
    // The 5-tap kernel reaches two pixels back, hence the earlier start.
    const uint8_t *src = edge;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1_u8(dst, edge_kernel_2_4_4_4_2(src));
    }
    if (remaining > 0) {
      edge_store_first_n(dst, edge_kernel_2_4_4_4_2(src), remaining);
    }
  }
}
| |
// Doubles the resolution of the intra edge around |p| in place.
//
// Input pixels are p[-1..sz-1]; |sz| must not exceed MAX_UPSAMPLE_SZ and a
// zero |sz| is a no-op. p[-2] is set to the old p[-1]. Starting at p[-1] the
// output alternates between an interpolated sample, computed with the kernel
// {-1, 9, 9, -1} / 16 (rounded, saturated to 8 bits), and the original pixel
// that follows it. Sixteen bytes are stored for every (started) group of
// eight input pixels, so the caller's buffer must have room for them.
void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
  if (!sz) return;

  assert(sz <= MAX_UPSAMPLE_SZ);

  // Snapshot of p[-1..sz-1]: the first pixel is repeated twice on the left
  // and the last pixel once on the right.
  const uint8_t first = p[-1];
  uint8_t edge[MAX_UPSAMPLE_SZ + 3];
  edge[0] = first;
  edge[1] = first;
  memcpy(edge + 2, p, sz);
  edge[sz + 2] = p[sz - 1];
  p[-2] = first;

  const uint8_t *in = edge;
  uint8_t *out = p - 1;
  int remaining = sz;

  do {
    const uint8x8_t a = vld1_u8(in);
    const uint8x8_t b = vld1_u8(in + 1);
    const uint8x8_t c = vld1_u8(in + 2);
    const uint8x8_t d = vld1_u8(in + 3);

    // 9 * (b + c) - (a + d), evaluated in 16-bit lanes. The difference can be
    // negative, so it is viewed as signed for the saturating narrowing shift.
    const uint16x8_t centre = vmulq_n_u16(vaddl_u8(b, c), 9);
    const uint16x8_t outer = vaddl_u8(a, d);
    const int16x8_t filtered = vreinterpretq_s16_u16(vsubq_u16(centre, outer));

    // Interleave: interpolated sample first, then the original pixel.
    uint8x8x2_t pair;
    pair.val[0] = vqrshrun_n_s16(filtered, 4);
    pair.val[1] = c;
    vst2_u8(out, pair);

    in += 8;
    out += 16;
    remaining -= 8;
  } while (remaining > 0);
}