blob: 9599ae06d21fbe93686896d8ab6bd8a2ebe9c949 [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/arm/mem_neon.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070022
// Bilinear filter for blocks of width 4.
//
// Every output pixel is the rounded blend
//   (a * (8 - filter_offset) + b * filter_offset + 4) >> 3
// where `a` is a source pixel and `b` is the pixel `pixel_step` bytes after
// it. Two source rows are consumed per iteration and their 8 results are
// stored back to back, i.e. dst_ptr is a packed buffer with a stride of 4.
//
// dst_height must be positive and even: the row counter drops by two per
// iteration and the loop only stops when it reaches exactly zero.
//
// NOTE(review): load_unaligned_u8() is declared outside this file; it
// presumably gathers two 4-byte rows that are `src_stride` apart into one
// vector - confirm in aom_dsp/arm/mem_neon.h.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  // An odd or non-positive height would step the counter past zero and keep
  // reading/writing beyond both buffers.
  assert(dst_height > 0 && dst_height % 2 == 0);

  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    // Widening multiply-accumulate keeps the weighted sum in 16 bits.
    uint16x8_t blend = vmull_u8(s0, f0);
    blend = vmlal_u8(blend, s1, f1);
    // Rounding shift by 3 divides by the total weight of 8.
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    i -= 2;
  } while (i != 0);
}
43
// Bilinear filter for blocks of width 8.
//
// One 8-pixel row is produced per iteration. Each pixel is blended with the
// pixel `pixel_step` bytes after it using the weights (8 - filter_offset) and
// filter_offset; the sum is rounded and shifted right by 3. Output rows are
// stored back to back in dst_ptr (stride 8).
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t weight0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t weight1 = vdup_n_u8(filter_offset);

  int rows_left = dst_height;
  do {
    const uint8x8_t px0 = vld1_u8(src_ptr);
    const uint8x8_t px1 = vld1_u8(src_ptr + pixel_step);
    // px0 * weight0 + px1 * weight1, accumulated in 16-bit lanes.
    const uint16x8_t acc = vmlal_u8(vmull_u8(px0, weight0), px1, weight1);
    vst1_u8(dst_ptr, vrshrn_n_u16(acc, 3));

    dst_ptr += 8;
    src_ptr += src_stride;
    rows_left--;
  } while (rows_left != 0);
}
63
// Bilinear filter for blocks whose width is a multiple of 16.
//
// Each row is processed in 16-byte vectors. Every pixel is blended with the
// pixel `pixel_step` bytes after it using the weights (8 - filter_offset) and
// filter_offset, and the sum is rounded and shifted right by 3. Output rows
// are packed: consecutive rows in dst_ptr are dst_width bytes apart.
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
                                         uint8_t *dst_ptr, int src_stride,
                                         int pixel_step, int dst_width,
                                         int dst_height, int filter_offset) {
  // Whole 16-byte vectors are loaded and stored, so any other width would
  // write past the end of a row (same contract as var_filter_block2d_avg).
  assert(dst_width >= 16 && dst_width % 16 == 0);

  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      // The widening multiplies work on 8 lanes, so each half of the vector
      // is blended separately.
      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
      vst1q_u8(dst_ptr + j, blend_u8);

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}
92
// Bilinear filter for blocks of width 16; forwards to the generic
// multiple-of-16 implementation with the width fixed.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  const int block_width = 16;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               block_width, dst_height, filter_offset);
}
99
// Bilinear filter for blocks of width 32; forwards to the generic
// multiple-of-16 implementation with the width fixed.
static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  const int block_width = 32;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               block_width, dst_height, filter_offset);
}
106
// Bilinear filter for blocks of width 64; forwards to the generic
// multiple-of-16 implementation with the width fixed.
static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  const int block_width = 64;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               block_width, dst_height, filter_offset);
}
113
// Bilinear filter for blocks of width 128; forwards to the generic
// multiple-of-16 implementation with the width fixed.
static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
                                        uint8_t *dst_ptr, int src_stride,
                                        int pixel_step, int dst_height,
                                        int filter_offset) {
  const int block_width = 128;
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                               block_width, dst_height, filter_offset);
}
121
// Half-pel filter: every output pixel is the rounding average of a source
// pixel and the pixel `pixel_step` bytes after it. Rows are processed in
// 16-byte vectors and written packed (dst_ptr rows are dst_width bytes apart).
static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                   int src_stride, int pixel_step,
                                   int dst_width, int dst_height) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    int col = 0;
    do {
      const uint8x16_t px0 = vld1q_u8(src_ptr + col);
      const uint8x16_t px1 = vld1q_u8(src_ptr + col + pixel_step);
      vst1q_u8(dst_ptr + col, vrhaddq_u8(px0, px1));

      col += 16;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
    rows_left--;
  } while (rows_left != 0);
}
144
// Defines aom_sub_pixel_variance<w>x<h>_neon using two bilinear passes:
// a horizontal pass (pixel step 1) over h + padding rows into a packed
// scratch buffer, then a vertical pass (pixel step w) over h rows. The
// variance of the filtered block against `ref` is returned by
// aom_variance<w>x<h>.
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                                \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
    uint8_t horiz[w * (h + padding)];                                          \
    uint8_t vert[w * h];                                                       \
    var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, (h + padding),      \
                                xoffset);                                      \
    var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset);                \
    return aom_variance##w##x##h(vert, w, ref, ref_stride, sse);               \
  }
Vitalii Dziumenkob54b7b42020-05-14 11:35:10 +0300156
// Defines aom_sub_pixel_variance<w>x<h>_neon with shortcuts for particular
// offsets, independently for each direction:
//   offset 0 -> that direction is not filtered at all,
//   offset 4 -> the two taps are equal, so a rounding average is used,
//   other    -> general bilinear filter.
// When both directions are filtered, the horizontal pass produces
// h + padding rows so that the vertical pass can read the row below the
// block.
#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                    \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
    if (xoffset == 0) {                                                        \
      if (yoffset == 0) {                                                      \
        return aom_variance##w##x##h##_neon(src, src_stride, ref, ref_stride,  \
                                            sse);                              \
      }                                                                        \
      /* Vertical filter only. */                                              \
      uint8_t vert_only[w * h];                                                \
      if (yoffset == 4) {                                                      \
        var_filter_block2d_avg(src, vert_only, src_stride, src_stride, w, h);  \
      } else {                                                                 \
        var_filter_block2d_bil_w##w(src, vert_only, src_stride, src_stride, h, \
                                    yoffset);                                  \
      }                                                                        \
      return aom_variance##w##x##h##_neon(vert_only, w, ref, ref_stride, sse); \
    }                                                                          \
    uint8_t horiz[w * (h + padding)];                                          \
    if (yoffset == 0) {                                                        \
      /* Horizontal filter only: h rows are enough. */                         \
      if (xoffset == 4) {                                                      \
        var_filter_block2d_avg(src, horiz, src_stride, 1, w, h);               \
      } else {                                                                 \
        var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, h, xoffset);    \
      }                                                                        \
      return aom_variance##w##x##h##_neon(horiz, w, ref, ref_stride, sse);     \
    }                                                                          \
    /* Two passes: h + padding filtered rows feed the vertical pass. */        \
    if (xoffset == 4) {                                                        \
      var_filter_block2d_avg(src, horiz, src_stride, 1, w, (h + padding));     \
    } else {                                                                   \
      var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, (h + padding),    \
                                  xoffset);                                    \
    }                                                                          \
    uint8_t vert[w * h];                                                       \
    if (yoffset == 4) {                                                        \
      var_filter_block2d_avg(horiz, vert, w, w, w, h);                         \
    } else {                                                                   \
      var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset);              \
    }                                                                          \
    return aom_variance##w##x##h##_neon(vert, w, ref, ref_stride, sse);        \
  }
211
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100212SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
213SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
Yaowu Xuc27fc142016-08-22 16:08:15 -0700214
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100215SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
216SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
217SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
Yaowu Xuc27fc142016-08-22 16:08:15 -0700218
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100219SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100220SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
221SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
Yaowu Xuc27fc142016-08-22 16:08:15 -0700222
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100223SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
224SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
225SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
Yaowu Xuc27fc142016-08-22 16:08:15 -0700226
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100227SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
228SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
229SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
Yaowu Xuc27fc142016-08-22 16:08:15 -0700230
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100231SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
232SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
Jerome Jiangc381ba82020-10-30 14:55:57 -0700233
234// Realtime mode doesn't use 4x rectangular blocks.
235#if !CONFIG_REALTIME_ONLY
Jerome Jiangc381ba82020-10-30 14:55:57 -0700236
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100237SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
Jerome Jiangc381ba82020-10-30 14:55:57 -0700238
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100239SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
Jerome Jiangc381ba82020-10-30 14:55:57 -0700240
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100241SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100242SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
Jerome Jiangc381ba82020-10-30 14:55:57 -0700243
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100244SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
Jerome Jiangc381ba82020-10-30 14:55:57 -0700245
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100246SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
Jerome Jiangc381ba82020-10-30 14:55:57 -0700247
Jerome Jiangc381ba82020-10-30 14:55:57 -0700248#endif // !CONFIG_REALTIME_ONLY
Jonathan Wrightce3b0012022-07-19 16:39:56 +0100249
250#undef SUBPEL_VARIANCE_WXH_NEON
Jonathan Wright2ce72a4332022-07-19 18:35:38 +0100251#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
Jonathan Wright3df6c182022-11-19 14:57:25 +0000252
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
//
// Every pixel is first blended with the pixel `pixel_step` bytes after it
// (weights 8 - filter_offset and filter_offset, rounded shift by 3) and the
// result is then averaged, with rounding, with the matching byte of
// second_pred. Two rows are handled per iteration; dst_ptr and second_pred
// are both packed buffers with a stride of 4.
//
// dst_height must be positive and even: the row counter drops by two per
// iteration and the loop only stops when it reaches exactly zero.
//
// NOTE(review): load_unaligned_u8() is declared outside this file; it
// presumably gathers two 4-byte rows that are `src_stride` apart into one
// vector - confirm in aom_dsp/arm/mem_neon.h.
static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  // An odd or non-positive height would step the counter past zero and keep
  // reading/writing beyond the buffers.
  assert(dst_height > 0 && dst_height % 2 == 0);

  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t blend = vmull_u8(s0, f0);
    blend = vmlal_u8(blend, s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);

    // Compound prediction: average the filtered pixels with second_pred.
    uint8x8_t p = vld1_u8(second_pred);
    uint8x8_t avg = vrhadd_u8(blend_u8, p);

    vst1_u8(dst_ptr, avg);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    second_pred += 2 * 4;
    i -= 2;
  } while (i != 0);
}
281
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
//
// Each 8-pixel row is bilinearly filtered (weights 8 - filter_offset and
// filter_offset, rounded shift by 3) and then averaged, with rounding, with
// the next 8 bytes of second_pred. dst_ptr and second_pred are packed
// buffers with a stride of 8.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  const uint8x8_t weight0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t weight1 = vdup_n_u8(filter_offset);

  int rows_left = dst_height;
  do {
    const uint8x8_t px0 = vld1_u8(src_ptr);
    const uint8x8_t px1 = vld1_u8(src_ptr + pixel_step);
    const uint16x8_t acc = vmlal_u8(vmull_u8(px0, weight0), px1, weight1);
    const uint8x8_t filtered = vrshrn_n_u16(acc, 3);

    const uint8x8_t pred = vld1_u8(second_pred);
    vst1_u8(dst_ptr, vrhadd_u8(filtered, pred));

    second_pred += 8;
    dst_ptr += 8;
    src_ptr += src_stride;
    rows_left--;
  } while (rows_left > 0);
}
309
// Combine bilinear filter with aom_comp_avg_pred for large blocks.
//
// Rows are processed in 16-byte vectors: each pixel is blended with the pixel
// `pixel_step` bytes after it (weights 8 - filter_offset and filter_offset,
// rounded shift by 3) and then averaged, with rounding, with second_pred.
// second_pred is consumed sequentially and dst_ptr rows are dst_width bytes
// apart.
static void avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred) {
  // Whole 16-byte vectors are loaded and stored, so any other width would
  // write past the end of a row (same contract as
  // avg_pred_var_filter_block2d_avg).
  assert(dst_width >= 16 && dst_width % 16 == 0);

  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      // The widening multiplies work on 8 lanes, so each half of the vector
      // is blended separately.
      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));

      // Compound prediction: average the filtered pixels with second_pred.
      uint8x16_t p = vld1q_u8(second_pred);
      uint8x16_t avg = vrhaddq_u8(blend_u8, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}
344
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
// Forwards to the generic multiple-of-16 implementation with the width fixed.
static void avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int block_width = 16;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, block_width, dst_height,
                                        filter_offset, second_pred);
}
353
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
// Forwards to the generic multiple-of-16 implementation with the width fixed.
static void avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int block_width = 32;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, block_width, dst_height,
                                        filter_offset, second_pred);
}
362
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
// Forwards to the generic multiple-of-16 implementation with the width fixed.
static void avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int block_width = 64;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, block_width, dst_height,
                                        filter_offset, second_pred);
}
371
// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
// Forwards to the generic multiple-of-16 implementation with the width fixed.
static void avg_pred_var_filter_block2d_bil_w128(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  const int block_width = 128;
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, block_width, dst_height,
                                        filter_offset, second_pred);
}
380
// Combine averaging subpel filter with aom_comp_avg_pred.
//
// Each pixel is averaged (with rounding) with the pixel `pixel_step` bytes
// after it, and that result is averaged (with rounding) with second_pred.
// second_pred is consumed sequentially; dst_ptr rows are dst_width bytes
// apart.
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                            uint8_t *dst_ptr, int src_stride,
                                            int pixel_step, int dst_width,
                                            int dst_height,
                                            const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    int col = 0;
    do {
      const uint8x16_t px0 = vld1q_u8(src_ptr + col);
      const uint8x16_t px1 = vld1q_u8(src_ptr + col + pixel_step);
      const uint8x16_t half_pel = vrhaddq_u8(px0, px1);

      const uint8x16_t pred = vld1q_u8(second_pred);
      vst1q_u8(dst_ptr + col, vrhaddq_u8(half_pel, pred));

      second_pred += 16;
      col += 16;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
    rows_left--;
  } while (rows_left != 0);
}
411
// Implementation of aom_comp_avg_pred for blocks having width >= 16.
//
// Every source pixel is averaged, with rounding, with the matching byte of
// second_pred. second_pred is consumed sequentially; dst_ptr rows are
// dst_width bytes apart.
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                     int dst_width, int dst_height,
                     const uint8_t *second_pred) {
  // Rows are processed in whole 16-byte vectors, so the width must be a
  // multiple of 16.
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    int col = 0;
    do {
      const uint8x16_t src_px = vld1q_u8(src_ptr + col);
      const uint8x16_t pred_px = vld1q_u8(second_pred);
      vst1q_u8(dst_ptr + col, vrhaddq_u8(src_px, pred_px));

      second_pred += 16;
      col += 16;
    } while (col < dst_width);

    dst_ptr += dst_width;
    src_ptr += src_stride;
    rows_left--;
  } while (rows_left != 0);
}
438
// Defines aom_sub_pixel_avg_variance<w>x<h>_neon using two bilinear passes:
// a plain horizontal pass over h + padding rows, then a vertical pass that
// also averages the result with second_pred. The variance of that block
// against `ref` is returned by aom_variance<w>x<h>.
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                            \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                     \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                            \
    uint8_t horiz[w * (h + padding)];                                          \
    uint8_t pred[w * h];                                                       \
    var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, (h + padding),   \
                                xoffset);                                      \
    avg_pred_var_filter_block2d_bil_w##w(horiz, pred, w, w, h, yoffset,        \
                                         second_pred);                         \
    return aom_variance##w##x##h(pred, w, ref, ref_stride, sse);               \
  }
452
// Defines aom_sub_pixel_avg_variance<w>x<h>_neon with shortcuts for
// particular offsets, independently for each direction:
//   offset 0 -> that direction is not filtered at all,
//   offset 4 -> the two taps are equal, so a rounding average is used,
//   other    -> general bilinear filter.
// The average with second_pred is always fused into the last stage that
// runs. When both directions are filtered, the horizontal pass produces
// h + padding rows for the vertical pass to read.
#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                     \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
      const uint8_t *second_pred) {                                            \
    if (xoffset == 0) {                                                        \
      /* No horizontal filtering needed. */                                    \
      uint8_t vert_only[w * h];                                                \
      if (yoffset == 0) {                                                      \
        avg_pred(src, vert_only, source_stride, w, h, second_pred);            \
      } else if (yoffset == 4) {                                               \
        avg_pred_var_filter_block2d_avg(src, vert_only, source_stride,         \
                                        source_stride, w, h, second_pred);     \
      } else {                                                                 \
        avg_pred_var_filter_block2d_bil_w##w(src, vert_only, source_stride,    \
                                             source_stride, h, yoffset,        \
                                             second_pred);                     \
      }                                                                        \
      return aom_variance##w##x##h##_neon(vert_only, w, ref, ref_stride, sse); \
    }                                                                          \
    uint8_t horiz[w * (h + padding)];                                          \
    if (yoffset == 0) {                                                        \
      /* Horizontal filter fused with the second_pred average. */              \
      if (xoffset == 4) {                                                      \
        avg_pred_var_filter_block2d_avg(src, horiz, source_stride, 1, w, h,    \
                                        second_pred);                          \
      } else {                                                                 \
        avg_pred_var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, h,  \
                                             xoffset, second_pred);            \
      }                                                                        \
      return aom_variance##w##x##h##_neon(horiz, w, ref, ref_stride, sse);     \
    }                                                                          \
    /* Two passes; only the vertical pass averages with second_pred. */        \
    if (xoffset == 4) {                                                        \
      var_filter_block2d_avg(src, horiz, source_stride, 1, w, (h + padding));  \
    } else {                                                                   \
      var_filter_block2d_bil_w##w(src, horiz, source_stride, 1, (h + padding), \
                                  xoffset);                                    \
    }                                                                          \
    uint8_t vert[w * h];                                                       \
    if (yoffset == 4) {                                                        \
      avg_pred_var_filter_block2d_avg(horiz, vert, w, w, w, h, second_pred);   \
    } else {                                                                   \
      avg_pred_var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset,      \
                                           second_pred);                       \
    }                                                                          \
    return aom_variance##w##x##h##_neon(vert, w, ref, ref_stride, sse);        \
  }
512
Jonathan Wright3df6c182022-11-19 14:57:25 +0000513SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
514SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
515
516SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
517SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
518SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
519
520SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000521SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
522SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000523
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000524SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
525SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
526SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000527
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000528SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
529SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
530SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000531
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000532SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
533SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000534
535#if !CONFIG_REALTIME_ONLY
536
537SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
538
539SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
540
541SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000542SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000543
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000544SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000545
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000546SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
Jonathan Wright3df6c182022-11-19 14:57:25 +0000547
548#endif // !CONFIG_REALTIME_ONLY
549
550#undef SUBPEL_AVG_VARIANCE_WXH_NEON
Jonathan Wrightfef440c2022-11-19 17:14:19 +0000551#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
Jonathan Wright07663e42023-04-05 14:17:32 +0100552
553#if !CONFIG_REALTIME_ONLY
554
// Defines aom_obmc_sub_pixel_variance<w>x<h>_neon using two bilinear passes
// over `pre`: a horizontal pass over h + padding rows, then a vertical pass
// over h rows. The filtered block is handed to aom_obmc_variance<w>x<h>
// together with wsrc and mask.
#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                           \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint8_t horiz[w * (h + padding)];                                          \
    uint8_t vert[w * h];                                                       \
    var_filter_block2d_bil_w##w(pre, horiz, pre_stride, 1, (h + padding),      \
                                xoffset);                                      \
    var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset);                \
    return aom_obmc_variance##w##x##h(vert, w, wsrc, mask, sse);               \
  }
566
// Defines aom_obmc_sub_pixel_variance<w>x<h>_neon with shortcuts for
// particular offsets, independently for each direction:
//   offset 0 -> that direction is not filtered at all,
//   offset 4 -> the two taps are equal, so a rounding average is used,
//   other    -> general bilinear filter.
// When both directions are filtered, the horizontal pass produces
// h + padding rows so that the vertical pass can read the row below the
// block.
#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)               \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    if (xoffset == 0) {                                                        \
      if (yoffset == 0) {                                                      \
        return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask,  \
                                                 sse);                         \
      }                                                                        \
      /* Vertical filter only. */                                              \
      uint8_t vert_only[w * h];                                                \
      if (yoffset == 4) {                                                      \
        var_filter_block2d_avg(pre, vert_only, pre_stride, pre_stride, w, h);  \
      } else {                                                                 \
        var_filter_block2d_bil_w##w(pre, vert_only, pre_stride, pre_stride, h, \
                                    yoffset);                                  \
      }                                                                        \
      return aom_obmc_variance##w##x##h##_neon(vert_only, w, wsrc, mask, sse); \
    }                                                                          \
    uint8_t horiz[w * (h + padding)];                                          \
    if (yoffset == 0) {                                                        \
      /* Horizontal filter only: h rows are enough. */                         \
      if (xoffset == 4) {                                                      \
        var_filter_block2d_avg(pre, horiz, pre_stride, 1, w, h);               \
      } else {                                                                 \
        var_filter_block2d_bil_w##w(pre, horiz, pre_stride, 1, h, xoffset);    \
      }                                                                        \
      return aom_obmc_variance##w##x##h##_neon(horiz, w, wsrc, mask, sse);     \
    }                                                                          \
    /* Two passes: h + padding filtered rows feed the vertical pass. */        \
    if (xoffset == 4) {                                                        \
      var_filter_block2d_avg(pre, horiz, pre_stride, 1, w, (h + padding));     \
    } else {                                                                   \
      var_filter_block2d_bil_w##w(pre, horiz, pre_stride, 1, (h + padding),    \
                                  xoffset);                                    \
    }                                                                          \
    uint8_t vert[w * h];                                                       \
    if (yoffset == 4) {                                                        \
      var_filter_block2d_avg(horiz, vert, w, w, w, h);                         \
    } else {                                                                   \
      var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset);              \
    }                                                                          \
    return aom_obmc_variance##w##x##h##_neon(vert, w, wsrc, mask, sse);        \
  }
621
Jonathan Wright07663e42023-04-05 14:17:32 +0100622OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
623OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
624OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
625
626OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
627OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
628OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
629OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
630
631OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
632OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
Jonathan Wright341aded2023-04-05 14:35:08 +0100633SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
634SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
635SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
Jonathan Wright07663e42023-04-05 14:17:32 +0100636
Jonathan Wright341aded2023-04-05 14:35:08 +0100637SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
638SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
639SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
640SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
Jonathan Wright07663e42023-04-05 14:17:32 +0100641
Jonathan Wright341aded2023-04-05 14:35:08 +0100642SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
643SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
644SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
645SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
Jonathan Wright07663e42023-04-05 14:17:32 +0100646
Jonathan Wright341aded2023-04-05 14:35:08 +0100647SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
648SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
Jonathan Wright07663e42023-04-05 14:17:32 +0100649
650#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
Jonathan Wright341aded2023-04-05 14:35:08 +0100651#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
Jonathan Wright07663e42023-04-05 14:17:32 +0100652#endif // !CONFIG_REALTIME_ONLY
Salome Thirot6d461a32023-04-28 14:29:32 +0100653
// Defines aom_masked_sub_pixel_variance<w>x<h>_neon: two bilinear passes
// (horizontal over h + padding rows, then vertical over h rows), followed by
// aom_comp_mask_pred_neon, which combines the filtered block with
// second_pred under msk / invert_mask. The variance of the combined block
// against `ref` is returned.
#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                         \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    uint8_t horiz[w * (h + padding)];                                          \
    uint8_t vert[w * h];                                                       \
    uint8_t blended[w * h];                                                    \
    var_filter_block2d_bil_w##w(src, horiz, src_stride, 1, (h + padding),      \
                                xoffset);                                      \
    var_filter_block2d_bil_w##w(horiz, vert, w, w, h, yoffset);                \
    aom_comp_mask_pred_neon(blended, second_pred, w, h, vert, w, msk,          \
                            msk_stride, invert_mask);                          \
    return aom_variance##w##x##h##_neon(blended, w, ref, ref_stride, sse);     \
  }
670
// Defines aom_masked_sub_pixel_variance<w>x<h>_neon() with dedicated paths
// for the offsets 0 and 4 in each direction.
//
// For each of xoffset and yoffset:
//   - 0: no filter pass is run in that direction.
//   - 4: var_filter_block2d_avg() is used in place of the bilinear filter.
//        NOTE(review): presumably a rounded average of the two taps, matching
//        the bilinear filter's equal 4/4 weights at offset 4 - confirm.
//   - anything else: var_filter_block2d_bil_w<w>() is used with that offset.
//
// Horizontal passes use a pixel step of 1; vertical passes use a step of one
// row (src_stride when reading src directly, w when reading a temporary).
// When a vertical pass follows a horizontal one, the horizontal pass emits
// (h + padding) rows so the vertical pass can read one row below each output
// row. Every temporary that holds a final filtered block is w * h bytes.
//
// In every path the (possibly filtered) block is passed with second_pred,
// the mask (msk / msk_stride) and invert_mask to aom_comp_mask_pred_neon(),
// and that output is handed to aom_variance<w>x<h>_neon() against ref, whose
// result is returned; *sse is whatever that call stores.
#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)             \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    if (xoffset == 0) {                                                        \
      uint8_t tmp0[w * h];                                                     \
      if (yoffset == 0) {                                                      \
        aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);    \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h);       \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
      } else {                                                                 \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h,      \
                                    yoffset);                                  \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint8_t tmp0[w * (h + padding)];                                         \
      if (yoffset == 0) {                                                      \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);                \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));    \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
      } else {                                                                 \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));    \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
      }                                                                        \
    } else {                                                                   \
      if (yoffset == 0) {                                                      \
        uint8_t tmp0[w * h];                                                   \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);     \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp0[w * (h + padding)];                                       \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),   \
                                    xoffset);                                  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
      } else {                                                                 \
        uint8_t tmp0[w * (h + padding)];                                       \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),   \
                                    xoffset);                                  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
      }                                                                        \
    }                                                                          \
  }
753
Salome Thirot6d461a32023-04-28 14:29:32 +0100754MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
755MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
756
757MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
758MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
759MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
760
761MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
Salome Thirot92b5cd32023-04-28 18:02:45 +0100762SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
763SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
Salome Thirot6d461a32023-04-28 14:29:32 +0100764
Salome Thirot92b5cd32023-04-28 18:02:45 +0100765SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
766SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
767SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
Salome Thirot6d461a32023-04-28 14:29:32 +0100768
Salome Thirot92b5cd32023-04-28 18:02:45 +0100769SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
770SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
771SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
Salome Thirot6d461a32023-04-28 14:29:32 +0100772
Salome Thirot92b5cd32023-04-28 18:02:45 +0100773SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
774SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
Salome Thirot6d461a32023-04-28 14:29:32 +0100775
776// Realtime mode doesn't use 4x rectangular blocks.
777#if !CONFIG_REALTIME_ONLY
778MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
779MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
780MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
Salome Thirot92b5cd32023-04-28 18:02:45 +0100781SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
782SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
783SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
Salome Thirot6d461a32023-04-28 14:29:32 +0100784#endif // !CONFIG_REALTIME_ONLY
785
786#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
Salome Thirot92b5cd32023-04-28 18:02:45 +0100787#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON