|  | /* | 
|  | * Copyright (c) 2023 The WebM project authors. All rights reserved. | 
|  | * Copyright (c) 2023, Alliance for Open Media. All rights reserved. | 
|  | * | 
|  | * This source code is subject to the terms of the BSD 2 Clause License and | 
|  | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
|  | * was not distributed with this source code in the LICENSE file, you can | 
|  | * obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
|  | * Media Patent License 1.0 was not distributed with this source code in the | 
|  | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
|  | */ | 
|  |  | 
|  | #include <arm_neon.h> | 
|  |  | 
|  | #include "config/aom_config.h" | 
|  | #include "config/aom_dsp_rtcd.h" | 
|  |  | 
|  | #include "aom_dsp/aom_filter.h" | 
|  | #include "aom_dsp/arm/mem_neon.h" | 
|  | #include "aom_dsp/arm/sum_neon.h" | 
|  | #include "aom_dsp/variance.h" | 
|  |  | 
// The bilinear filters look like this:
//
// {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
//  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
//
// We can factor out the highest common factor (16), such that the sum of both
// weights will be 8 instead of 128. The benefits of this are two-fold:
//
// 1) We can infer the filter values from the filter_offset parameter in the
// bilinear filter functions below - we don't have to actually load the values
// from memory:
// f0 = 8 - filter_offset
// f1 = filter_offset
//
// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
// 16-bit data types at all times, rather than widening out to 32-bit and
// requiring double the number of data processing instructions. (12-bit * 8 =
// 15-bit.)
|  |  | 
// Two-tap bilinear filter for a block that is exactly 4 pixels wide.
//
// Every output pixel blends the source pixel with the one pixel_step elements
// after it, using the weights (8 - filter_offset) and filter_offset, followed
// by a rounding right shift by 3. Output rows are written back to back, four
// elements per row. The row counter is tested after the loop body has run, so
// dst_height is expected to be at least 1.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  const uint16x4_t weight0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t weight1 = vdup_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    const uint16x4_t px0 = load_unaligned_u16_4x1(src_ptr);
    const uint16x4_t px1 = load_unaligned_u16_4x1(src_ptr + pixel_step);

    // px0 * weight0 + px1 * weight1; the weights sum to 8, hence the shift by
    // 3 below.
    const uint16x4_t weighted_sum =
        vmla_u16(vmul_u16(px0, weight0), px1, weight1);

    vst1_u16(dst_ptr, vrshr_n_u16(weighted_sum, 3));

    dst_ptr += 4;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
|  |  | 
// Two-tap bilinear filter for blocks whose width is a multiple of 8.
//
// Eight pixels are handled per inner iteration: each output pixel blends the
// source pixel with the one pixel_step elements after it, using the weights
// (8 - filter_offset) and filter_offset, followed by a rounding right shift by
// 3. Output rows are packed with a stride of dst_width. Both loops test their
// counters after the body has run, so at least one 8-pixel group and one row
// are always processed.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
                                                uint16_t *dst_ptr,
                                                int src_stride, int pixel_step,
                                                int dst_width, int dst_height,
                                                int filter_offset) {
  const uint16x8_t weight0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t weight1 = vdupq_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    // Per-row cursors; the outer pointers only move by whole rows.
    const uint16_t *src_row = src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint16x8_t px0 = vld1q_u16(src_row);
      const uint16x8_t px1 = vld1q_u16(src_row + pixel_step);

      const uint16x8_t weighted_sum =
          vmlaq_u16(vmulq_u16(px0, weight0), px1, weight1);

      vst1q_u16(dst_row, vrshrq_n_u16(weighted_sum, 3));

      src_row += 8;
      dst_row += 8;
      cols_left -= 8;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows_left != 0);
}
|  |  | 
// Bilinear filter for blocks 8 pixels wide: forwards to the multiple-of-8
// implementation with the width fixed at 8.
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  const int block_width = 8;

  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
|  |  | 
// Bilinear filter for blocks 16 pixels wide: forwards to the multiple-of-8
// implementation with the width fixed at 16.
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  const int block_width = 16;

  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
|  |  | 
// Bilinear filter for blocks 32 pixels wide: forwards to the multiple-of-8
// implementation with the width fixed at 32.
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  const int block_width = 32;

  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
|  |  | 
// Bilinear filter for blocks 64 pixels wide: forwards to the multiple-of-8
// implementation with the width fixed at 64.
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  const int block_width = 64;

  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
|  |  | 
// Bilinear filter for blocks 128 pixels wide: forwards to the multiple-of-8
// implementation with the width fixed at 128.
static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
                                               uint16_t *dst_ptr,
                                               int src_stride, int pixel_step,
                                               int dst_height,
                                               int filter_offset) {
  const int block_width = 128;

  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      block_width, dst_height, filter_offset);
}
|  |  | 
// Half-pel filter: every output pixel is the rounding average (vrhaddq_u16)
// of the source pixel and the pixel pixel_step elements after it. Eight
// pixels are processed per inner iteration and output rows are packed with a
// stride of dst_width.
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
                                          uint16_t *dst_ptr, int src_stride,
                                          int pixel_step, int dst_width,
                                          int dst_height) {
  // This averaging specialization is only used for widths that are multiples
  // of 16.
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    const uint16_t *src_row = src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint16x8_t px0 = vld1q_u16(src_row);
      const uint16x8_t px1 = vld1q_u16(src_row + pixel_step);

      vst1q_u16(dst_row, vrhaddq_u16(px0, px1));

      src_row += 8;
      dst_row += 8;
      cols_left -= 8;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows_left != 0);
}
|  |  | 
// Defines aom_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() using the
// generic two-pass bilinear path:
//   1) filter horizontally (pixel_step 1) into (h + 1) rows of width w,
//   2) filter those rows vertically (pixel_step w) down to h rows,
//   3) hand the result, with stride w, to
//      aom_highbd_<bitdepth>_variance<w>x<h>() together with ref, ref_stride
//      and sse, and return its result.
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);                              \
    uint16_t horz[w * (h + 1)];                                                \
    uint16_t vert[w * h];                                                      \
                                                                               \
    highbd_var_filter_block2d_bil_w##w(src_u16, horz, src_stride, 1, (h + 1),  \
                                       xoffset);                               \
    highbd_var_filter_block2d_bil_w##w(horz, vert, w, w, h, yoffset);          \
                                                                               \
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(vert), \
                                                     w, ref, ref_stride, sse); \
  }
|  |  | 
// Defines aom_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() with fast
// paths selected by the sub-pixel offsets. For each direction (xoffset for the
// horizontal pass, yoffset for the vertical pass):
//   offset == 0: the pass is skipped,
//   offset == 4: the pass uses highbd_var_filter_block2d_avg(),
//   otherwise:   the pass uses highbd_var_filter_block2d_bil_w<w>().
// When both passes run, the horizontal pass writes (h + 1) rows to tmp0 and
// the vertical pass reduces them to h rows in tmp1, so tmp1 only needs
// w * h elements. The filtered block (or the source itself when both offsets
// are 0) is passed to aom_highbd_<bitdepth>_variance<w>x<h>() together with
// ref, ref_stride and sse, and that function's result is returned.
// The averaging helper asserts that w is a multiple of 16.
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      if (yoffset == 0) {                                                      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
                                      h);                                      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else {                                                                 \
        uint16_t tmp[w * h];                                                   \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
                                           src_stride, h, yoffset);            \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
                                           xoffset);                           \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }
|  |  | 
|  | // 8-bit | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) | 
|  |  | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) | 
|  |  | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | // 10-bit | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) | 
|  |  | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) | 
|  |  | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | // 12-bit | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) | 
|  |  | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) | 
|  |  | 
|  | HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
// Bilinear filter fused with the aom_highbd_comp_avg_pred step, for blocks
// exactly 4 pixels wide.
//
// Each row is filtered with the weights (8 - filter_offset) and filter_offset
// (rounding right shift by 3), then combined with the next four values of
// second_pred using a rounding average (vrhadd_u16). second_pred is consumed
// contiguously, four elements per row.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const uint16x4_t weight0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t weight1 = vdup_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    const uint16x4_t px0 = load_unaligned_u16_4x1(src_ptr);
    const uint16x4_t px1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
    const uint16x4_t pred = vld1_u16(second_pred);

    const uint16x4_t weighted_sum =
        vmla_u16(vmul_u16(px0, weight0), px1, weight1);
    const uint16x4_t filtered = vrshr_n_u16(weighted_sum, 3);

    vst1_u16(dst_ptr, vrhadd_u16(filtered, pred));

    second_pred += 4;
    dst_ptr += 4;
    src_ptr += src_stride;
  } while (--rows_left != 0);
}
|  |  | 
// Bilinear filter fused with the aom_highbd_comp_avg_pred step, for blocks
// whose width is a multiple of 8.
//
// Eight pixels are filtered per inner iteration with the weights
// (8 - filter_offset) and filter_offset (rounding right shift by 3), then
// combined with the next eight values of second_pred using a rounding average
// (vrhaddq_u16). second_pred is consumed contiguously across rows; output rows
// are packed with a stride of dst_width.
static void highbd_avg_pred_var_filter_block2d_bil_large(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint16_t *second_pred) {
  const uint16x8_t weight0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t weight1 = vdupq_n_u16(filter_offset);
  int rows_left = dst_height;

  do {
    const uint16_t *src_row = src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint16x8_t px0 = vld1q_u16(src_row);
      const uint16x8_t px1 = vld1q_u16(src_row + pixel_step);
      const uint16x8_t pred = vld1q_u16(second_pred);

      const uint16x8_t weighted_sum =
          vmlaq_u16(vmulq_u16(px0, weight0), px1, weight1);
      const uint16x8_t filtered = vrshrq_n_u16(weighted_sum, 3);

      vst1q_u16(dst_row, vrhaddq_u16(filtered, pred));

      src_row += 8;
      dst_row += 8;
      second_pred += 8;
      cols_left -= 8;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows_left != 0);
}
|  |  | 
// Fused bilinear filter + second_pred average for blocks 8 pixels wide:
// forwards to the multiple-of-8 implementation with the width fixed at 8.
static void highbd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 8;

  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
|  |  | 
// Fused bilinear filter + second_pred average for blocks 16 pixels wide:
// forwards to the multiple-of-8 implementation with the width fixed at 16.
static void highbd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 16;

  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
|  |  | 
// Fused bilinear filter + second_pred average for blocks 32 pixels wide:
// forwards to the multiple-of-8 implementation with the width fixed at 32.
static void highbd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 32;

  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
|  |  | 
// Fused bilinear filter + second_pred average for blocks 64 pixels wide:
// forwards to the multiple-of-8 implementation with the width fixed at 64.
static void highbd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 64;

  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
|  |  | 
// Fused bilinear filter + second_pred average for blocks 128 pixels wide:
// forwards to the multiple-of-8 implementation with the width fixed at 128.
static void highbd_avg_pred_var_filter_block2d_bil_w128(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  const int block_width = 128;

  highbd_avg_pred_var_filter_block2d_bil_large(
      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
      filter_offset, second_pred);
}
|  |  | 
// Half-pel averaging filter fused with the aom_highbd_comp_avg_pred step.
//
// Each output pixel is the rounding average (vrhaddq_u16) of the source pixel
// and the pixel pixel_step elements after it, which is then rounding-averaged
// again with the matching value from second_pred. second_pred is consumed
// contiguously across rows; output rows are packed with a stride of
// dst_width.
static void highbd_avg_pred_var_filter_block2d_avg(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint16_t *second_pred) {
  // This averaging specialization is only used for widths that are multiples
  // of 16.
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    const uint16_t *src_row = src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint16x8_t px0 = vld1q_u16(src_row);
      const uint16x8_t px1 = vld1q_u16(src_row + pixel_step);
      const uint16x8_t half_pel = vrhaddq_u16(px0, px1);

      const uint16x8_t pred = vld1q_u16(second_pred);

      vst1q_u16(dst_row, vrhaddq_u16(half_pel, pred));

      src_row += 8;
      dst_row += 8;
      second_pred += 8;
      cols_left -= 8;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows_left != 0);
}
|  |  | 
// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
//
// Every output pixel is the rounding average (vrhaddq_u16) of the source
// pixel and the matching value from second_pred; no sub-pixel filtering is
// applied. second_pred is consumed contiguously across rows; output rows are
// packed with a stride of dst_width.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                            int src_stride, int dst_width, int dst_height,
                            const uint16_t *second_pred) {
  // Only widths that are multiples of 16 are routed to this helper.
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int rows_left = dst_height;
  do {
    const uint16_t *src_row = src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = dst_width;

    do {
      const uint16x8_t px = vld1q_u16(src_row);
      const uint16x8_t pred = vld1q_u16(second_pred);

      vst1q_u16(dst_row, vrhaddq_u16(px, pred));

      src_row += 8;
      dst_row += 8;
      second_pred += 8;
      cols_left -= 8;
    } while (cols_left > 0);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--rows_left != 0);
}
|  |  | 
// Defines aom_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon() using
// the generic two-pass bilinear path:
//   1) filter horizontally (pixel_step 1) into (h + 1) rows of width w,
//   2) filter those rows vertically (pixel_step w) down to h rows while
//      averaging with second_pred in the same pass,
//   3) hand the result, with stride w, to
//      aom_highbd_<bitdepth>_variance<w>x<h>() together with ref, ref_stride
//      and sse, and return its result.
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                       \
  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                            \
    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);                              \
    uint16_t horz[w * (h + 1)];                                                \
    uint16_t vert[w * h];                                                      \
                                                                               \
    highbd_var_filter_block2d_bil_w##w(src_u16, horz, src_stride, 1, (h + 1),  \
                                       xoffset);                               \
    highbd_avg_pred_var_filter_block2d_bil_w##w(                               \
        horz, vert, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));       \
                                                                               \
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(vert), \
                                                     w, ref, ref_stride, sse); \
  }
|  |  | 
// Defines aom_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon() with
// fast paths selected by the sub-pixel offsets. For each direction (xoffset
// for the horizontal pass, yoffset for the vertical pass):
//   offset == 0: the pass is skipped,
//   offset == 4: the pass uses a rounding average of the two taps,
//   otherwise:   the pass uses the two-tap bilinear filter.
// The average with second_pred is fused into the last pass that runs (or done
// on its own by highbd_avg_pred() when both offsets are 0). When both passes
// run, the horizontal pass writes (h + 1) rows to tmp0 and the vertical pass
// reduces them to h rows in tmp1, so tmp1 only needs w * h elements. The
// result is passed, with stride w, to aom_highbd_<bitdepth>_variance<w>x<h>()
// together with ref, ref_stride and sse, and that function's result is
// returned. The averaging helpers assert that w is a multiple of 16.
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                            \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      uint16_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
                        CONVERT_TO_SHORTPTR(second_pred));                     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else if (yoffset == 4) {                                               \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            src_ptr, tmp, source_stride, source_stride, w, h,                  \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else {                                                                 \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            src_ptr, tmp0, source_stride, 1, w, h,                             \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
                                      (h + 1));                                \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
                                      (h + 1));                                \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
                                           (h + 1), xoffset);                  \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
                                           (h + 1), xoffset);                  \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }
|  |  | 
|  | // 8-bit | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) | 
|  |  | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16) | 
|  |  | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | // 10-bit: widths 4 and 8 use the generic macro; widths 16 and up use the SPECIALIZED one. | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) | 
|  |  | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY /* the sizes below are built only when CONFIG_REALTIME_ONLY is 0 */ | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16) | 
|  |  | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | // 12-bit: widths 4 and 8 use the generic macro; widths 16 and up use the SPECIALIZED one. | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) | 
|  |  | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY /* the sizes below are built only when CONFIG_REALTIME_ONLY is 0 */ | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16) | 
|  |  | 
|  | HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4) | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
// Generic masked sub-pixel variance for a w x h block. Expands to
// aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon(), which:
//   1. filters src with highbd_var_filter_block2d_bil_w<w> using xoffset,
//      asking for h + 1 rows;
//   2. filters that result again using yoffset to get h rows;
//   3. combines the result with second_pred through
//      aom_highbd_comp_mask_pred_neon() (msk, msk_stride, invert_mask);
//   4. returns aom_highbd_<bitdepth>_variance<w>x<h>() of that block against
//      ref, handing sse through to it.
#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                    \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
          const uint8_t *msk, int msk_stride, int invert_mask,                 \
          unsigned int *sse) {                                                 \
    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);                          \
    uint16_t horiz_out[w * (h + 1)];                                           \
    uint16_t vert_out[w * (h + 1)];                                            \
    uint16_t blended[w * h];                                                   \
                                                                               \
    /* Pass 1: filter along the row (step 1) with xoffset, h + 1 rows. */      \
    highbd_var_filter_block2d_bil_w##w(src16, horiz_out, src_stride, 1,        \
                                       (h + 1), xoffset);                      \
    /* Pass 2: filter down the columns (step w) with yoffset, h rows. */       \
    highbd_var_filter_block2d_bil_w##w(horiz_out, vert_out, w, w, h, yoffset); \
    /* Blend the interpolated block with second_pred under the mask. */        \
    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(blended), second_pred,   \
                                   w, h, CONVERT_TO_BYTEPTR(vert_out), w,      \
                                   msk, msk_stride, invert_mask);              \
    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
        CONVERT_TO_BYTEPTR(blended), w, ref, ref_stride, sse);                 \
  }
|  |  | 
// Masked sub-pixel variance for a w x h block with shortcuts for offsets 0
// and 4. Expands to
// aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon().
//
// The block to be blended is produced in up to two passes:
//   - an offset of 0 skips that pass entirely;
//   - an offset of 4 uses highbd_var_filter_block2d_avg();
//   - any other offset uses highbd_var_filter_block2d_bil_w<w>().
// With xoffset == 0 and yoffset == 0 the source is blended as-is, using
// src_stride. In every case the block is then combined with second_pred by
// aom_highbd_comp_mask_pred_neon(), and the function returns
// aom_highbd_<bitdepth>_variance<w>x<h>() of the blend against ref, handing
// sse through to it.
#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)        \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
          const uint8_t *msk, int msk_stride, int invert_mask,                 \
          unsigned int *sse) {                                                 \
    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);                          \
    uint16_t pass1[w * (h + 1)];                                               \
    uint16_t pass2[w * h];                                                     \
    uint16_t blended[w * h];                                                   \
    /* Block given to the blend; the raw source unless a filter runs. */       \
    const uint8_t *pred = src;                                                 \
    int pred_stride = src_stride;                                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      /* No first pass: a non-zero yoffset filters the source itself. */       \
      if (yoffset != 0) {                                                      \
        if (yoffset == 4) {                                                    \
          highbd_var_filter_block2d_avg(src16, pass2, src_stride, src_stride,  \
                                        w, h);                                 \
        } else {                                                               \
          highbd_var_filter_block2d_bil_w##w(src16, pass2, src_stride,         \
                                             src_stride, h, yoffset);          \
        }                                                                      \
        pred = CONVERT_TO_BYTEPTR(pass2);                                      \
        pred_stride = w;                                                       \
      }                                                                        \
    } else {                                                                   \
      /* Ask for h + 1 rows when a second pass follows, h otherwise. */        \
      const int pass1_rows = (yoffset == 0) ? h : (h + 1);                     \
      if (xoffset == 4) {                                                      \
        highbd_var_filter_block2d_avg(src16, pass1, src_stride, 1, w,          \
                                      pass1_rows);                             \
      } else {                                                                 \
        highbd_var_filter_block2d_bil_w##w(src16, pass1, src_stride, 1,        \
                                           pass1_rows, xoffset);               \
      }                                                                        \
      if (yoffset == 0) {                                                      \
        pred = CONVERT_TO_BYTEPTR(pass1);                                      \
      } else {                                                                 \
        if (yoffset == 4) {                                                    \
          highbd_var_filter_block2d_avg(pass1, pass2, w, w, w, h);             \
        } else {                                                               \
          highbd_var_filter_block2d_bil_w##w(pass1, pass2, w, w, h, yoffset);  \
        }                                                                      \
        pred = CONVERT_TO_BYTEPTR(pass2);                                      \
      }                                                                        \
      pred_stride = w;                                                         \
    }                                                                          \
                                                                               \
    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(blended), second_pred,   \
                                   w, h, pred, pred_stride, msk, msk_stride,   \
                                   invert_mask);                               \
    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
        CONVERT_TO_BYTEPTR(blended), w, ref, ref_stride, sse);                 \
  }
|  |  | 
|  | // 8-bit: widths 4 and 8 use the generic macro; widths 16 and up use the SPECIALIZED one. | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) | 
|  |  | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY /* the sizes below are built only when CONFIG_REALTIME_ONLY is 0 */ | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) | 
|  |  | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | // 10-bit: widths 4 and 8 use the generic macro; widths 16 and up use the SPECIALIZED one. | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) | 
|  |  | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY /* the sizes below are built only when CONFIG_REALTIME_ONLY is 0 */ | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) | 
|  |  | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | // 12-bit: widths 4 and 8 use the generic macro; widths 16 and up use the SPECIALIZED one. | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) | 
|  |  | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY /* the sizes below are built only when CONFIG_REALTIME_ONLY is 0 */ | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) | 
|  |  | 
|  | HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) | 
|  |  | 
|  | HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) | 
|  | #endif  // !CONFIG_REALTIME_ONLY | 
|  |  | 
|  | #if !CONFIG_REALTIME_ONLY | 
// Generic OBMC sub-pixel variance for a w x h block. Expands to
// aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon(), which filters
// pre with highbd_var_filter_block2d_bil_w<w> twice (first with xoffset for
// h + 1 rows, then with yoffset for h rows) and returns
// aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon() of the filtered block,
// handing wsrc, mask and sse through to it.
#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                   \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(         \
          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,        \
          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {       \
    uint16_t *const pre16 = CONVERT_TO_SHORTPTR(pre);                          \
    uint16_t horiz_out[w * (h + 1)];                                           \
    uint16_t vert_out[w * h];                                                  \
                                                                               \
    /* Pass 1: filter along the row (step 1) with xoffset, h + 1 rows. */      \
    highbd_var_filter_block2d_bil_w##w(pre16, horiz_out, pre_stride, 1,        \
                                       h + 1, xoffset);                        \
    /* Pass 2: filter down the columns (step w) with yoffset, h rows. */       \
    highbd_var_filter_block2d_bil_w##w(horiz_out, vert_out, w, w, h, yoffset); \
    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(              \
        CONVERT_TO_BYTEPTR(vert_out), w, wsrc, mask, sse);                     \
  }
|  |  | 
// OBMC sub-pixel variance for a w x h block with shortcuts for offsets 0 and
// 4. Expands to aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon().
//
// The block to be measured is produced in up to two passes:
//   - an offset of 0 skips that pass entirely;
//   - an offset of 4 uses highbd_var_filter_block2d_avg();
//   - any other offset uses highbd_var_filter_block2d_bil_w<w>().
// With xoffset == 0 and yoffset == 0, pre is measured as-is with pre_stride.
// The function returns aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon() of
// the resulting block, handing wsrc, mask and sse through to it.
#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)       \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(         \
          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,        \
          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {       \
    uint16_t *const pre16 = CONVERT_TO_SHORTPTR(pre);                          \
    uint16_t pass1[w * (h + 1)];                                               \
    uint16_t pass2[w * h];                                                     \
    /* Block that gets measured; the raw input unless a filter runs. */        \
    const uint8_t *pred = pre;                                                 \
    int pred_stride = pre_stride;                                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      /* No first pass: a non-zero yoffset filters the input itself. */        \
      if (yoffset != 0) {                                                      \
        if (yoffset == 4) {                                                    \
          highbd_var_filter_block2d_avg(pre16, pass2, pre_stride, pre_stride,  \
                                        w, h);                                 \
        } else {                                                               \
          highbd_var_filter_block2d_bil_w##w(pre16, pass2, pre_stride,         \
                                             pre_stride, h, yoffset);          \
        }                                                                      \
        pred = CONVERT_TO_BYTEPTR(pass2);                                      \
        pred_stride = w;                                                       \
      }                                                                        \
    } else {                                                                   \
      /* Ask for h + 1 rows when a second pass follows, h otherwise. */        \
      const int pass1_rows = (yoffset == 0) ? h : (h + 1);                     \
      if (xoffset == 4) {                                                      \
        highbd_var_filter_block2d_avg(pre16, pass1, pre_stride, 1, w,          \
                                      pass1_rows);                             \
      } else {                                                                 \
        highbd_var_filter_block2d_bil_w##w(pre16, pass1, pre_stride, 1,        \
                                           pass1_rows, xoffset);               \
      }                                                                        \
      if (yoffset == 0) {                                                      \
        pred = CONVERT_TO_BYTEPTR(pass1);                                      \
      } else {                                                                 \
        if (yoffset == 4) {                                                    \
          highbd_var_filter_block2d_avg(pass1, pass2, w, w, w, h);             \
        } else {                                                               \
          highbd_var_filter_block2d_bil_w##w(pass1, pass2, w, w, h, yoffset);  \
        }                                                                      \
        pred = CONVERT_TO_BYTEPTR(pass2);                                      \
      }                                                                        \
      pred_stride = w;                                                         \
    }                                                                          \
                                                                               \
    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(              \
        pred, pred_stride, wsrc, mask, sse);                                   \
  }
|  |  | 
|  | // 8-bit: widths 4 and 8, plus 16x4 and 16x8, use the generic macro; the rest use the SPECIALIZED one. | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) | 
|  |  | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) | 
|  |  | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) | 
|  |  | 
|  | // 10-bit: widths 4 and 8, plus 16x4 and 16x8, use the generic macro; the rest use the SPECIALIZED one. | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) | 
|  |  | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) | 
|  |  | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) | 
|  |  | 
|  | // 12-bit: widths 4 and 8, plus 16x4 and 16x8, use the generic macro; the rest use the SPECIALIZED one. | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) | 
|  |  | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) | 
|  |  | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) | 
|  | HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) | 
|  |  | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) | 
|  | SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) | 
|  | #endif  // !CONFIG_REALTIME_ONLY |