/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"

// Average the w x h block at src into dst: dst[i] = ROUND((src[i] + dst[i]) / 2)
// using NEON rounding halving adds (vrhadd => (a + b + 1) >> 1, no overflow).
//
// The filter_x/filter_y parameters exist only to match the aom_convolve
// function-pointer signature; this "copy with averaging" kernel ignores them.
//
// NOTE(review): the w <= 32 paths step the row loop by 2, so h is assumed to
// be even there (block sizes in this codec are powers of two) — confirm with
// callers. The avg4 path loads/stores rows through uint32_t lanes, so 4-wide
// rows are assumed to tolerate 32-bit lane access at their addresses.
void aom_convolve_avg_neon(const uint8_t *src,    // r0
                           ptrdiff_t src_stride,  // r1
                           uint8_t *dst,          // r2
                           ptrdiff_t dst_stride,  // r3
                           const int16_t *filter_x, int filter_x_stride,
                           const int16_t *filter_y, int filter_y_stride, int w,
                           int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint32x2_t d0u32, d2u32;
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  // vld1_lane_u32 merges one lane into an existing vector, so the destination
  // operand must be initialized before its first use; reading an
  // uninitialized vector is undefined behavior.
  d0u32 = vdup_n_u32(0);
  d2u32 = vdup_n_u32(0);

  d = dst;  // read cursor over dst; 'dst' itself is reused as the write cursor
  if (w > 32) {  // avg64: one 64-byte row per iteration
    for (; h > 0; h -= 1) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // avg32: two 32-byte rows per iteration
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      d += dst_stride;
      q10u8 = vld1q_u8(d);
      q11u8 = vld1q_u8(d + 16);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // avg16: two 16-byte rows per iteration
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;
      q2u8 = vld1q_u8(d);
      d += dst_stride;
      q3u8 = vld1q_u8(d);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q2u8);
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // avg8: pack two 8-byte rows into one 128-bit op
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d1u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(d);
      d += dst_stride;
      d3u8 = vld1_u8(d);
      d += dst_stride;

      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      q0u8 = vrhaddq_u8(q0u8, q1u8);

      vst1_u8(dst, vget_low_u8(q0u8));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(q0u8));
      dst += dst_stride;
    }
  } else {  // avg4: two 4-byte rows packed into the lanes of one 64-bit vector
    for (; h > 0; h -= 2) {
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
      src += src_stride;
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
      src += src_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
      d += dst_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
      dst += dst_stride;
    }
  }
}