/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
| 11 | |
| 12 | #include <arm_neon.h> |
| 13 | #include <assert.h> |
| 14 | |
| 15 | #include "av1/common/arm/mem_neon.h" |
| 16 | #include "config/aom_dsp_rtcd.h" |
| 17 | |
| 18 | static INLINE uint32x4_t sum_squares_i16_4x4_neon(const int16_t *src, |
| 19 | int stride) { |
| 20 | const int16x4_t v_val_01_lo = vld1_s16(src + 0 * stride); |
| 21 | const int16x4_t v_val_01_hi = vld1_s16(src + 1 * stride); |
| 22 | const int16x4_t v_val_23_lo = vld1_s16(src + 2 * stride); |
| 23 | const int16x4_t v_val_23_hi = vld1_s16(src + 3 * stride); |
| 24 | int32x4_t v_sq_01_d = vmull_s16(v_val_01_lo, v_val_01_lo); |
| 25 | v_sq_01_d = vmlal_s16(v_sq_01_d, v_val_01_hi, v_val_01_hi); |
| 26 | int32x4_t v_sq_23_d = vmull_s16(v_val_23_lo, v_val_23_lo); |
| 27 | v_sq_23_d = vmlal_s16(v_sq_23_d, v_val_23_hi, v_val_23_hi); |
| 28 | #if defined(__aarch64__) |
| 29 | return vreinterpretq_u32_s32(vpaddq_s32(v_sq_01_d, v_sq_23_d)); |
| 30 | #else |
| 31 | return vreinterpretq_u32_s32(vcombine_s32( |
| 32 | vqmovn_s64(vpaddlq_s32(v_sq_01_d)), vqmovn_s64(vpaddlq_s32(v_sq_23_d)))); |
| 33 | #endif |
| 34 | } |
| 35 | |
| 36 | uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src, int stride) { |
| 37 | const uint32x4_t v_sum_0123_d = sum_squares_i16_4x4_neon(src, stride); |
| 38 | #if defined(__aarch64__) |
| 39 | return (uint64_t)vaddvq_u32(v_sum_0123_d); |
| 40 | #else |
| 41 | uint64x2_t v_sum_d = vpaddlq_u32(v_sum_0123_d); |
| 42 | v_sum_d = vaddq_u64(v_sum_d, vextq_u64(v_sum_d, v_sum_d, 1)); |
| 43 | return vgetq_lane_u64(v_sum_d, 0); |
| 44 | #endif |
| 45 | } |
| 46 | |
| 47 | uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src, int stride, |
| 48 | int height) { |
| 49 | int r = 0; |
| 50 | uint32x4_t v_acc_q = vdupq_n_u32(0); |
| 51 | do { |
| 52 | const uint32x4_t v_acc_d = sum_squares_i16_4x4_neon(src, stride); |
| 53 | v_acc_q = vaddq_u32(v_acc_q, v_acc_d); |
| 54 | src += stride << 2; |
| 55 | r += 4; |
| 56 | } while (r < height); |
| 57 | |
| 58 | uint64x2_t v_acc_64 = vpaddlq_u32(v_acc_q); |
| 59 | #if defined(__aarch64__) |
| 60 | return vaddvq_u64(v_acc_64); |
| 61 | #else |
| 62 | v_acc_64 = vaddq_u64(v_acc_64, vextq_u64(v_acc_64, v_acc_64, 1)); |
| 63 | return vgetq_lane_u64(v_acc_64, 0); |
| 64 | #endif |
| 65 | } |
| 66 | |
| 67 | uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src, int stride, |
| 68 | int width, int height) { |
| 69 | int r = 0; |
| 70 | const int32x4_t zero = vdupq_n_s32(0); |
| 71 | uint64x2_t v_acc_q = vreinterpretq_u64_s32(zero); |
| 72 | do { |
| 73 | int32x4_t v_sum = zero; |
| 74 | int c = 0; |
| 75 | do { |
| 76 | const int16_t *b = src + c; |
| 77 | const int16x8_t v_val_0 = vld1q_s16(b + 0 * stride); |
| 78 | const int16x8_t v_val_1 = vld1q_s16(b + 1 * stride); |
| 79 | const int16x8_t v_val_2 = vld1q_s16(b + 2 * stride); |
| 80 | const int16x8_t v_val_3 = vld1q_s16(b + 3 * stride); |
| 81 | const int16x4_t v_val_0_lo = vget_low_s16(v_val_0); |
| 82 | const int16x4_t v_val_1_lo = vget_low_s16(v_val_1); |
| 83 | const int16x4_t v_val_2_lo = vget_low_s16(v_val_2); |
| 84 | const int16x4_t v_val_3_lo = vget_low_s16(v_val_3); |
| 85 | int32x4_t v_sum_01 = vmull_s16(v_val_0_lo, v_val_0_lo); |
| 86 | v_sum_01 = vmlal_s16(v_sum_01, v_val_1_lo, v_val_1_lo); |
| 87 | int32x4_t v_sum_23 = vmull_s16(v_val_2_lo, v_val_2_lo); |
| 88 | v_sum_23 = vmlal_s16(v_sum_23, v_val_3_lo, v_val_3_lo); |
| 89 | #if defined(__aarch64__) |
| 90 | v_sum_01 = vmlal_high_s16(v_sum_01, v_val_0, v_val_0); |
| 91 | v_sum_01 = vmlal_high_s16(v_sum_01, v_val_1, v_val_1); |
| 92 | v_sum_23 = vmlal_high_s16(v_sum_23, v_val_2, v_val_2); |
| 93 | v_sum_23 = vmlal_high_s16(v_sum_23, v_val_3, v_val_3); |
| 94 | v_sum = vaddq_s32(v_sum, vpaddq_s32(v_sum_01, v_sum_23)); |
| 95 | #else |
| 96 | const int16x4_t v_val_0_hi = vget_high_s16(v_val_0); |
| 97 | const int16x4_t v_val_1_hi = vget_high_s16(v_val_1); |
| 98 | const int16x4_t v_val_2_hi = vget_high_s16(v_val_2); |
| 99 | const int16x4_t v_val_3_hi = vget_high_s16(v_val_3); |
| 100 | v_sum_01 = vmlal_s16(v_sum_01, v_val_0_hi, v_val_0_hi); |
| 101 | v_sum_01 = vmlal_s16(v_sum_01, v_val_1_hi, v_val_1_hi); |
| 102 | v_sum_23 = vmlal_s16(v_sum_23, v_val_2_hi, v_val_2_hi); |
| 103 | v_sum_23 = vmlal_s16(v_sum_23, v_val_3_hi, v_val_3_hi); |
| 104 | v_sum = vaddq_s32(v_sum, vcombine_s32(vqmovn_s64(vpaddlq_s32(v_sum_01)), |
| 105 | vqmovn_s64(vpaddlq_s32(v_sum_23)))); |
| 106 | #endif |
| 107 | c += 8; |
| 108 | } while (c < width); |
| 109 | |
| 110 | v_acc_q = vpadalq_u32(v_acc_q, vreinterpretq_u32_s32(v_sum)); |
| 111 | |
| 112 | src += 4 * stride; |
| 113 | r += 4; |
| 114 | } while (r < height); |
| 115 | #if defined(__aarch64__) |
| 116 | return vaddvq_u64(v_acc_q); |
| 117 | #else |
| 118 | v_acc_q = vaddq_u64(v_acc_q, vextq_u64(v_acc_q, v_acc_q, 1)); |
| 119 | return vgetq_lane_u64(v_acc_q, 0); |
| 120 | #endif |
| 121 | } |
| 122 | |
/*
 * Sum of squares of a `width` x `height` block of int16_t samples.
 *
 * Picks the NEON kernel that matches the block shape and falls back to the
 * C implementation for any shape the NEON kernels do not handle.
 *
 * src    - pointer to the first sample of the block.
 * stride - distance, in samples, between the starts of consecutive rows.
 * width  - number of columns.
 * height - number of rows.
 */
uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
                                     int height) {
  // A 4-sample row fills only half a SIMD register, so width 4 needs its own
  // kernels. It is also the common case: over 75% of all calls are with
  // size == 4.
  if (LIKELY(width == 4 && height == 4)) {
    return aom_sum_squares_2d_i16_4x4_neon(src, stride);
  }

  const int height_is_multiple_of_4 = (height & 3) == 0;

  if (LIKELY(width == 4 && height_is_multiple_of_4)) {
    return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
  }

  // Generic case: 8 columns by 4 rows per step.
  if (LIKELY((width & 7) == 0 && height_is_multiple_of_4)) {
    return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
  }

  // Any other shape: use the C implementation.
  return aom_sum_squares_2d_i16_c(src, stride, width, height);
}