/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "./aom_config.h"

#include "aom/aom_integer.h"

unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 15; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

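/* Note: vld1_u8 loads 8 bytes, so each 4-pixel row is over-read by 4 bytes.
 * Only the low four 16-bit lanes of q12 (the absolute differences of the
 * first 4 columns) are summed below, so the extra bytes do not affect the
 * result. */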
unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
                             unsigned char *ref_ptr, int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x2_t d1;
  uint64x1_t d3;
  int i;

  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 3; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  d1 = vpaddl_u16(vget_low_u16(q12));
  d3 = vpaddl_u32(d1);

  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x16_t q0, q4;
  uint16x8_t q12, q13;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  q0 = vld1q_u8(src_ptr);
  src_ptr += src_stride;
  q4 = vld1q_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

  for (i = 0; i < 7; i++) {
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
  }

  q12 = vaddq_u16(q12, q13);
  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

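/* Horizontal reductions for the 16-bit accumulators used below.
 * horizontal_long_add_16x8() widens each accumulator to 32 bits (vaddl_u16)
 * before combining; aom_sad64x64_neon() needs this because a single lane can
 * reach 4 * 64 * 255 = 65280, so adding the two accumulators in 16 bits could
 * overflow. horizontal_add_16x8() reduces one already-combined 16-bit vector,
 * which is sufficient for the smaller block sizes. */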
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo =
        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
    vec_accum_hi =
        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}