ext-inter: Delete dead code

Patches https://aomedia-review.googlesource.com/c/11987/
and https://aomedia-review.googlesource.com/c/11988/
replaced the old masked motion search pipeline with
a new one that uses different SAD/SSE functions,
leaving behind a large amount of dead code.
This patch removes the now-dead code. Note that this
includes vectorized SAD/SSE functions, which will need
to be rewritten at some point for the new pipeline. It
also includes the masked_compound_variance_* functions
since these turned out not to be used by the new pipeline.

To help with the later addition of vectorized functions, the
masked_sad/variance_test.cc files are kept but modified
to work with the new functions. The tests are disabled
until vectorized implementations of the new functions exist.
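
For context, the old and new C reference SAD computations differ as in the
sketch below. This is a simplified, one-dimensional illustration with
made-up helper names, not code from this patch: the old path weighted each
absolute difference directly by the 0..64 mask value, while the new compound
path first blends the two predictors with the mask and then takes an ordinary
SAD against the source (cf. aom_comp_mask_pred in the retained sub-pel
variance macro).

    #include <stdint.h>
    #include <stdlib.h>

    /* Old pipeline (removed below): weight each absolute difference by the
     * 0..64 mask value, then undo the mask scaling with a rounding shift. */
    static unsigned int old_style_masked_sad(const uint8_t *a, const uint8_t *b,
                                             const uint8_t *m, int n) {
      unsigned int sad = 0;
      for (int i = 0; i < n; ++i) sad += m[i] * abs(a[i] - b[i]);
      return (sad + 31) >> 6;
    }

    /* New pipeline (kept): blend the two predictors with the mask, then take
     * a plain SAD against the source. invert_mask in the real API simply
     * swaps which predictor is weighted by m and which by 64 - m. */
    static unsigned int new_style_masked_compound_sad(
        const uint8_t *src, const uint8_t *ref, const uint8_t *second_pred,
        const uint8_t *m, int n) {
      unsigned int sad = 0;
      for (int i = 0; i < n; ++i) {
        const int pred =
            (m[i] * ref[i] + (64 - m[i]) * second_pred[i] + 32) >> 6;
        sad += abs(pred - src[i]);
      }
      return sad;
    }
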
Change-Id: I61b686abd14bba5280bed94e1be62eb74ea23d89
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index c248f3e..08dea4e 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -317,11 +317,6 @@
"${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c")
- set(AOM_DSP_ENCODER_INTRIN_SSSE3
- ${AOM_DSP_ENCODER_INTRIN_SSSE3}
- "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
-
set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}
"${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm"
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index da173f5..1129ba3 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -342,10 +342,6 @@
endif
ifeq ($(CONFIG_AV1_ENCODER),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
-endif #CONFIG_EXT_INTER
ifeq ($(CONFIG_MOTION_VAR),yes)
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 28f997a..129ad72 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -738,17 +738,12 @@
if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
- specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
add_proto qw/unsigned int/, "aom_masked_compound_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
- specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
-
add_proto qw/unsigned int/, "aom_highbd_masked_compound_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
}
}
@@ -1048,12 +1043,6 @@
#
foreach (@block_sizes) {
($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
- specialize "aom_masked_variance${w}x${h}", qw/ssse3/;
- specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-
- add_proto qw/unsigned int/, "aom_masked_compound_variance${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *m, int m_stride, int invert_mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_masked_compound_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
}
@@ -1061,12 +1050,6 @@
foreach $bd ("_", "_10_", "_12_") {
foreach (@block_sizes) {
($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
- specialize "aom_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
- specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-
- add_proto qw/unsigned int/, "aom_highbd${bd}masked_compound_variance${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *m, int m_stride, int invert_mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_highbd${bd}masked_compound_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
}
}
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index e7f31a1..e4be68c 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -312,30 +312,11 @@
#if CONFIG_AV1 && CONFIG_EXT_INTER
static INLINE
- unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, const uint8_t *m, int m_stride,
- int width, int height) {
- int y, x;
- unsigned int sad = 0;
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
-
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- sad = (sad + 31) >> 6;
-
- return sad;
-}
-
-static INLINE unsigned int masked_compound_sad(const uint8_t *src,
- int src_stride, const uint8_t *a,
- int a_stride, const uint8_t *b,
- int b_stride, const uint8_t *m,
- int m_stride, int width,
- int height) {
+ unsigned int masked_compound_sad(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int width,
+ int height) {
int y, x;
unsigned int sad = 0;
@@ -356,12 +337,6 @@
}
#define MASKSADMxN(m, n) \
- unsigned int aom_masked_sad##m##x##n##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \
- n); \
- } \
unsigned int aom_masked_compound_sad##m##x##n##_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
@@ -397,31 +372,11 @@
#if CONFIG_HIGHBITDEPTH
static INLINE
- unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int width,
- int height) {
- int y, x;
- unsigned int sad = 0;
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
-
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- sad = (sad + 31) >> 6;
-
- return sad;
-}
-
-static INLINE unsigned int highbd_masked_compound_sad(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int width,
- int height) {
+ unsigned int highbd_masked_compound_sad(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride,
+ int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -445,12 +400,6 @@
}
#define HIGHBD_MASKSADMXN(m, n) \
- unsigned int aom_highbd_masked_sad##m##x##n##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \
- msk_stride, m, n); \
- } \
unsigned int aom_highbd_masked_compound_sad##m##x##n##_c( \
const uint8_t *src8, int src_stride, const uint8_t *ref8, \
int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 90d0622..85adcd1 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -714,163 +714,42 @@
}
}
-void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, const uint8_t *m, int m_stride, int w, int h,
- unsigned int *sse, int *sum) {
- int i, j;
-
- int64_t sum64 = 0;
- uint64_t sse64 = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = (a[j] - b[j]) * (m[j]);
- sum64 += diff;
- sse64 += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- sum64 = (sum64 >= 0) ? sum64 : -sum64;
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
-}
-
-void masked_compound_variance(const uint8_t *src, int src_stride,
- const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, const uint8_t *m, int m_stride,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
-
- int64_t sum64 = 0;
- uint64_t sse64 = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const uint8_t pred = AOM_BLEND_A64(m[j], a[j], b[j]);
- const int diff = pred - src[j];
- sum64 += diff;
- sse64 += diff * diff;
- }
-
- src += src_stride;
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- sum64 = (sum64 >= 0) ? sum64 : -sum64;
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
-}
-
-#define MASK_VAR(W, H) \
- unsigned int aom_masked_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- int sum; \
- masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- unsigned int aom_masked_compound_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *m, int m_stride, \
- int invert_mask, unsigned int *sse) { \
- int sum; \
- if (!invert_mask) \
- masked_compound_variance(src, src_stride, ref, ref_stride, second_pred, \
- W, m, m_stride, W, H, sse, &sum); \
- else \
- masked_compound_variance(src, src_stride, second_pred, W, ref, \
- ref_stride, m, m_stride, W, H, sse, &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+#define MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_masked_compound_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_mask_pred(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
+ invert_mask); \
+ return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
}
-#define MASK_SUBPIX_VAR(W, H) \
- unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- \
- var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- return aom_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \
- msk_stride, sse); \
- } \
- \
- unsigned int aom_masked_compound_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- aom_comp_mask_pred(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
- invert_mask); \
- return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
- }
-
-MASK_VAR(4, 4)
MASK_SUBPIX_VAR(4, 4)
-
-MASK_VAR(4, 8)
MASK_SUBPIX_VAR(4, 8)
-
-MASK_VAR(8, 4)
MASK_SUBPIX_VAR(8, 4)
-
-MASK_VAR(8, 8)
MASK_SUBPIX_VAR(8, 8)
-
-MASK_VAR(8, 16)
MASK_SUBPIX_VAR(8, 16)
-
-MASK_VAR(16, 8)
MASK_SUBPIX_VAR(16, 8)
-
-MASK_VAR(16, 16)
MASK_SUBPIX_VAR(16, 16)
-
-MASK_VAR(16, 32)
MASK_SUBPIX_VAR(16, 32)
-
-MASK_VAR(32, 16)
MASK_SUBPIX_VAR(32, 16)
-
-MASK_VAR(32, 32)
MASK_SUBPIX_VAR(32, 32)
-
-MASK_VAR(32, 64)
MASK_SUBPIX_VAR(32, 64)
-
-MASK_VAR(64, 32)
MASK_SUBPIX_VAR(64, 32)
-
-MASK_VAR(64, 64)
MASK_SUBPIX_VAR(64, 64)
-
#if CONFIG_EXT_PARTITION
-MASK_VAR(64, 128)
MASK_SUBPIX_VAR(64, 128)
-
-MASK_VAR(128, 64)
MASK_SUBPIX_VAR(128, 64)
-
-MASK_VAR(128, 128)
MASK_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
@@ -920,270 +799,7 @@
}
}
-void highbd_masked_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m,
- int m_stride, int w, int h, uint64_t *sse,
- int64_t *sum) {
- int i, j;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = (a[j] - b[j]) * (m[j]);
- *sum += (int64_t)diff;
- *sse += (int64_t)diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- *sum = (*sum >= 0) ? *sum : -*sum;
- *sum = ROUND_POWER_OF_TWO(*sum, 6);
- *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, const uint8_t *m, int m_stride, int w,
- int h, unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
- &sse64, &sum64);
- *sum = (int)sum64;
- *sse = (unsigned int)sse64;
-}
-
-void highbd_10_masked_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
- &sse64, &sum64);
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-void highbd_12_masked_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
- &sse64, &sum64);
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-void highbd_masked_compound_variance64(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int w,
- int h, uint64_t *sse, int64_t *sum) {
- int i, j;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const uint16_t pred = AOM_BLEND_A64(m[j], a[j], b[j]);
- const int diff = pred - src[j];
- *sum += (int64_t)diff;
- *sse += (int64_t)diff * diff;
- }
-
- src += src_stride;
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- *sum = (*sum >= 0) ? *sum : -*sum;
- *sum = ROUND_POWER_OF_TWO(*sum, 6);
- *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-void highbd_masked_compound_variance(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int w,
- int h, unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_masked_compound_variance64(src8, src_stride, a8, a_stride, b8,
- b_stride, m, m_stride, w, h, &sse64,
- &sum64);
- *sum = (int)sum64;
- *sse = (unsigned int)sse64;
-}
-
-void highbd_10_masked_compound_variance(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int w,
- int h, unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_masked_compound_variance64(src8, src_stride, a8, a_stride, b8,
- b_stride, m, m_stride, w, h, &sse64,
- &sum64);
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-void highbd_12_masked_compound_variance(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int w,
- int h, unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_masked_compound_variance64(src8, src_stride, a8, a_stride, b8,
- b_stride, m, m_stride, w, h, &sse64,
- &sum64);
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HIGHBD_MASK_VAR(W, H) \
- unsigned int aom_highbd_masked_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- int sum; \
- highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \
- &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- unsigned int aom_highbd_10_masked_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- int sum; \
- int64_t var; \
- highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \
- sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- unsigned int aom_highbd_12_masked_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- int sum; \
- int64_t var; \
- highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \
- sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- unsigned int aom_highbd_masked_compound_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *m, int m_stride, \
- int invert_mask, unsigned int *sse) { \
- int sum; \
- if (!invert_mask) \
- highbd_masked_compound_variance(src, src_stride, ref, ref_stride, \
- second_pred, W, m, m_stride, W, H, sse, \
- &sum); \
- else \
- highbd_masked_compound_variance(src, src_stride, second_pred, W, ref, \
- ref_stride, m, m_stride, W, H, sse, \
- &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- unsigned int aom_highbd_10_masked_compound_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *m, int m_stride, \
- int invert_mask, unsigned int *sse) { \
- int sum; \
- if (!invert_mask) \
- highbd_10_masked_compound_variance(src, src_stride, ref, ref_stride, \
- second_pred, W, m, m_stride, W, H, \
- sse, &sum); \
- else \
- highbd_10_masked_compound_variance(src, src_stride, second_pred, W, ref, \
- ref_stride, m, m_stride, W, H, sse, \
- &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- unsigned int aom_highbd_12_masked_compound_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *m, int m_stride, \
- int invert_mask, unsigned int *sse) { \
- int sum; \
- if (!invert_mask) \
- highbd_12_masked_compound_variance(src, src_stride, ref, ref_stride, \
- second_pred, W, m, m_stride, W, H, \
- sse, &sum); \
- else \
- highbd_12_masked_compound_variance(src, src_stride, second_pred, W, ref, \
- ref_stride, m, m_stride, W, H, sse, \
- &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- }
-
#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
- unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_masked_variance##W##x##H##_c( \
- CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
- } \
- \
- unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_10_masked_variance##W##x##H##_c( \
- CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
- } \
- \
- unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_12_masked_variance##W##x##H##_c( \
- CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
- } \
- \
unsigned int aom_highbd_masked_compound_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
@@ -1250,53 +866,22 @@
ref, ref_stride, sse); \
}
-HIGHBD_MASK_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 4)
-
-HIGHBD_MASK_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
-
-HIGHBD_MASK_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
-
-HIGHBD_MASK_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
-
-HIGHBD_MASK_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
-
-HIGHBD_MASK_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
-
-HIGHBD_MASK_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
-
-HIGHBD_MASK_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
-
-HIGHBD_MASK_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
-
-HIGHBD_MASK_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
-
-HIGHBD_MASK_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
-
-HIGHBD_MASK_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
-
-HIGHBD_MASK_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
-
#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
-
-HIGHBD_MASK_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
-
-HIGHBD_MASK_VAR(128, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_HIGHBITDEPTH
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index adcf8b4..1b546ab 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -55,26 +55,10 @@
int b_stride, unsigned int *sse, const uint8_t *second_pred);
#if CONFIG_AV1 && CONFIG_EXT_INTER
-typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *msk_ptr,
- int msk_stride);
-typedef unsigned int (*aom_masked_variance_fn_t)(
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
- const uint8_t *msk, int msk_stride, unsigned int *sse);
-typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride,
- unsigned int *sse);
-
typedef unsigned int (*aom_masked_compound_sad_fn_t)(
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
int invert_mask);
-typedef unsigned int (*aom_masked_compound_variance_fn_t)(
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred, const uint8_t *m, int m_stride, int invert_mask,
- unsigned int *sse);
typedef unsigned int (*aom_masked_compound_subpixvariance_fn_t)(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
@@ -106,12 +90,7 @@
aom_sad_multi_fn_t sdx8f;
aom_sad_multi_d_fn_t sdx4df;
#if CONFIG_EXT_INTER
- aom_masked_sad_fn_t msdf;
- aom_masked_variance_fn_t mvf;
- aom_masked_subpixvariance_fn_t msvf;
-
aom_masked_compound_sad_fn_t mcsdf;
- aom_masked_compound_variance_fn_t mcvf;
aom_masked_compound_subpixvariance_fn_t mcsvf;
#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
deleted file mode 100644
index 5166e9e..0000000
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-
-#include "aom_ports/mem.h"
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-
-static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
- __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
- __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
- return _mm_unpacklo_epi64(temp1, temp2);
-}
-
-static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
- __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
- __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
- __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
- temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
- temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
- temp1 = _mm_unpacklo_epi32(temp1, temp2);
- return _mm_unpacklo_epi64(temp3, temp1);
-}
-
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int width, int height);
-
-static INLINE unsigned int masked_sad8xh_ssse3(
- const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height);
-
-static INLINE unsigned int masked_sad4xh_ssse3(
- const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height);
-
-#define MASKSADMXN_SSSE3(m, n) \
- unsigned int aom_masked_sad##m##x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
- m, n); \
- }
-
-#if CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(128, 128)
-MASKSADMXN_SSSE3(128, 64)
-MASKSADMXN_SSSE3(64, 128)
-#endif // CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(64, 64)
-MASKSADMXN_SSSE3(64, 32)
-MASKSADMXN_SSSE3(32, 64)
-MASKSADMXN_SSSE3(32, 32)
-MASKSADMXN_SSSE3(32, 16)
-MASKSADMXN_SSSE3(16, 32)
-MASKSADMXN_SSSE3(16, 16)
-MASKSADMXN_SSSE3(16, 8)
-
-#define MASKSAD8XN_SSSE3(n) \
- unsigned int aom_masked_sad8x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
- msk_stride, n); \
- }
-
-MASKSAD8XN_SSSE3(16)
-MASKSAD8XN_SSSE3(8)
-MASKSAD8XN_SSSE3(4)
-
-#define MASKSAD4XN_SSSE3(n) \
- unsigned int aom_masked_sad4x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
- msk_stride, n); \
- }
-
-MASKSAD4XN_SSSE3(8)
-MASKSAD4XN_SSSE3(4)
-
-// For width a multiple of 16
-// Assumes values in m are <=64
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int width, int height) {
- int y, x;
- __m128i a, b, m, temp1, temp2;
- __m128i res = _mm_setzero_si128();
- __m128i one = _mm_set1_epi16(1);
- // For each row
- for (y = 0; y < height; y++) {
- // Covering the full width
- for (x = 0; x < width; x += 16) {
- // Load a, b, m in xmm registers
- a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
- b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
- m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
-
- // Calculate the difference between a & b
- temp1 = _mm_subs_epu8(a, b);
- temp2 = _mm_subs_epu8(b, a);
- temp1 = _mm_or_si128(temp1, temp2);
-
- // Multiply by m and add together
- temp2 = _mm_maddubs_epi16(temp1, m);
- // Pad out row result to 32 bit integers & add to running total
- res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
- }
- // Move onto the next row
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- // sad = (sad + 31) >> 6;
- return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-static INLINE unsigned int masked_sad8xh_ssse3(
- const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height) {
- int y;
- __m128i a, b, m, temp1, temp2, row_res;
- __m128i res = _mm_setzero_si128();
- __m128i one = _mm_set1_epi16(1);
- // Add the masked SAD for 2 rows at a time
- for (y = 0; y < height; y += 2) {
- // Load a, b, m in xmm registers
- a = width8_load_2rows(a_ptr, a_stride);
- b = width8_load_2rows(b_ptr, b_stride);
- m = width8_load_2rows(m_ptr, m_stride);
-
- // Calculate the difference between a & b
- temp1 = _mm_subs_epu8(a, b);
- temp2 = _mm_subs_epu8(b, a);
- temp1 = _mm_or_si128(temp1, temp2);
-
- // Multiply by m and add together
- row_res = _mm_maddubs_epi16(temp1, m);
-
- // Pad out row result to 32 bit integers & add to running total
- res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
-
- // Move onto the next rows
- a_ptr += a_stride * 2;
- b_ptr += b_stride * 2;
- m_ptr += m_stride * 2;
- }
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- // sad = (sad + 31) >> 6;
- return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-static INLINE unsigned int masked_sad4xh_ssse3(
- const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height) {
- int y;
- __m128i a, b, m, temp1, temp2, row_res;
- __m128i res = _mm_setzero_si128();
- __m128i one = _mm_set1_epi16(1);
- // Add the masked SAD for 4 rows at a time
- for (y = 0; y < height; y += 4) {
- // Load a, b, m in xmm registers
- a = width4_load_4rows(a_ptr, a_stride);
- b = width4_load_4rows(b_ptr, b_stride);
- m = width4_load_4rows(m_ptr, m_stride);
-
- // Calculate the difference between a & b
- temp1 = _mm_subs_epu8(a, b);
- temp2 = _mm_subs_epu8(b, a);
- temp1 = _mm_or_si128(temp1, temp2);
-
- // Multiply by m and add together
- row_res = _mm_maddubs_epi16(temp1, m);
-
- // Pad out row result to 32 bit integers & add to running total
- res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
-
- // Move onto the next rows
- a_ptr += a_stride * 4;
- b_ptr += b_stride * 4;
- m_ptr += m_stride * 4;
- }
- // Pad out row result to 32 bit integers & add to running total
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- // sad = (sad + 31) >> 6;
- return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-#if CONFIG_HIGHBITDEPTH
-static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
- int stride) {
- __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
- __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
- return _mm_unpacklo_epi64(temp1, temp2);
-}
-
-static INLINE unsigned int highbd_masked_sad_ssse3(
- const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int width, int height);
-
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
- const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height);
-
-#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
- unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
- msk_stride, m, n); \
- }
-
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(128, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 128)
-#endif // CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(64, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 64)
-HIGHBD_MASKSADMXN_SSSE3(32, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 32)
-HIGHBD_MASKSADMXN_SSSE3(16, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 16)
-HIGHBD_MASKSADMXN_SSSE3(8, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 4)
-
-#define HIGHBD_MASKSAD4XN_SSSE3(n) \
- unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *msk, int msk_stride) { \
- return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
- msk_stride, n); \
- }
-
-HIGHBD_MASKSAD4XN_SSSE3(8)
-HIGHBD_MASKSAD4XN_SSSE3(4)
-
-// For width a multiple of 8
-// Assumes values in m are <=64
-static INLINE unsigned int highbd_masked_sad_ssse3(
- const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int width, int height) {
- int y, x;
- __m128i a, b, m, temp1, temp2;
- const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
- const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
- __m128i res = _mm_setzero_si128();
- // For each row
- for (y = 0; y < height; y++) {
- // Covering the full width
- for (x = 0; x < width; x += 8) {
- // Load a, b, m in xmm registers
- a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
- b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
- m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
- _mm_setzero_si128());
-
- // Calculate the difference between a & b
- temp1 = _mm_subs_epu16(a, b);
- temp2 = _mm_subs_epu16(b, a);
- temp1 = _mm_or_si128(temp1, temp2);
-
- // Add result of multiplying by m and add pairs together to running total
- res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
- }
- // Move onto the next row
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- // sad = (sad + 31) >> 6;
- return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-
-static INLINE unsigned int highbd_masked_sad4xh_ssse3(
- const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height) {
- int y;
- __m128i a, b, m, temp1, temp2;
- const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
- const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
- __m128i res = _mm_setzero_si128();
- // Add the masked SAD for 2 rows at a time
- for (y = 0; y < height; y += 2) {
- // Load a, b, m in xmm registers
- a = highbd_width4_load_2rows(a_ptr, a_stride);
- b = highbd_width4_load_2rows(b_ptr, b_stride);
- temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
- temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
- m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
- _mm_setzero_si128());
-
- // Calculate the difference between a & b
- temp1 = _mm_subs_epu16(a, b);
- temp2 = _mm_subs_epu16(b, a);
- temp1 = _mm_or_si128(temp1, temp2);
-
- // Multiply by m and add together
- res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
-
- // Move onto the next rows
- a_ptr += a_stride * 2;
- b_ptr += b_stride * 2;
- m_ptr += m_stride * 2;
- }
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- res = _mm_hadd_epi32(res, _mm_setzero_si128());
- // sad = (sad + 31) >> 6;
- return (_mm_cvtsi128_si32(res) + 31) >> 6;
-}
-#endif // CONFIG_HIGHBITDEPTH
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
deleted file mode 100644
index fe14597..0000000
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ /dev/null
@@ -1,1948 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_filter.h"
-
-// Half pixel shift
-#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2)
-
-/*****************************************************************************
- * Horizontal additions
- *****************************************************************************/
-
-static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
- v_d = _mm_hadd_epi32(v_d, v_d);
- v_d = _mm_hadd_epi32(v_d, v_d);
- return _mm_cvtsi128_si32(v_d);
-}
-
-static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
- v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
-#if ARCH_X86_64
- return _mm_cvtsi128_si64(v_q);
-#else
- {
- int64_t tmp;
- _mm_storel_epi64((__m128i *)&tmp, v_q);
- return tmp;
- }
-#endif
-}
-
-#if CONFIG_HIGHBITDEPTH
-static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
- const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
- const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
- const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
- return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
-}
-#endif // CONFIG_HIGHBITDEPTH
-
-static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
- uint32_t *sse, int w, int h) {
- int64_t sum64;
- uint64_t sse64;
-
- // Horizontal sum
- sum64 = hsum_epi32_si32(v_sum_d);
- sse64 = hsum_epi64_si64(v_sse_q);
-
- sum64 = (sum64 >= 0) ? sum64 : -sum64;
-
- // Round
- sum64 = ROUND_POWER_OF_TWO(sum64, 6);
- sse64 = ROUND_POWER_OF_TWO(sse64, 12);
-
- // Store the SSE
- *sse = (uint32_t)sse64;
- // Compute the variance
- return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-/*****************************************************************************
- * n*16 Wide versions
- *****************************************************************************/
-
-static INLINE unsigned int masked_variancewxh_ssse3(
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
- int ii, jj;
-
- const __m128i v_zero = _mm_setzero_si128();
-
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
-
- assert((w % 16) == 0);
-
- for (ii = 0; ii < h; ii++) {
- for (jj = 0; jj < w; jj += 16) {
- // Load inputs - 8 bits
- const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj));
- const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj));
- const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj));
-
- // Unpack to 16 bits - still containing max 8 bits
- const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
- const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
- const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
- const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
- const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
- const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
-
- // Difference: [-255, 255]
- const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
- const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
-
- // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
- const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
- const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
- const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
- const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
-
- // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
- const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
- const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
-
- // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
- const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
-
- // Unpack Squared error to 64 bits
- const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
- const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
- // Accumulate
- v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d);
- v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
- }
-
- // Move on to next row
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
-
- return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-
-#define MASKED_VARWXH(W, H) \
- unsigned int aom_masked_variance##W##x##H##_ssse3( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \
- H, sse); \
- }
-
-MASKED_VARWXH(16, 8)
-MASKED_VARWXH(16, 16)
-MASKED_VARWXH(16, 32)
-MASKED_VARWXH(32, 16)
-MASKED_VARWXH(32, 32)
-MASKED_VARWXH(32, 64)
-MASKED_VARWXH(64, 32)
-MASKED_VARWXH(64, 64)
-#if CONFIG_EXT_PARTITION
-MASKED_VARWXH(64, 128)
-MASKED_VARWXH(128, 64)
-MASKED_VARWXH(128, 128)
-#endif // CONFIG_EXT_PARTITION
-
-/*****************************************************************************
- * 8 Wide versions
- *****************************************************************************/
-
-static INLINE unsigned int masked_variance8xh_ssse3(
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride, int h, unsigned int *sse) {
- int ii;
-
- const __m128i v_zero = _mm_setzero_si128();
-
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
-
- for (ii = 0; ii < h; ii++) {
- // Load inputs - 8 bits
- const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a);
- const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b);
- const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m);
-
- // Unpack to 16 bits - still containing max 8 bits
- const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
- const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
- const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
- // Difference: [-255, 255]
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
- // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
- const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
- const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
- // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
- const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
-
- // Unpack Squared error to 64 bits
- const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
- const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
- // Accumulate
- v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
-
- // Move on to next row
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
-
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-#define MASKED_VAR8XH(H) \
- unsigned int aom_masked_variance8x##H##_ssse3( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
- sse); \
- }
-
-MASKED_VAR8XH(4)
-MASKED_VAR8XH(8)
-MASKED_VAR8XH(16)
-
-/*****************************************************************************
- * 4 Wide versions
- *****************************************************************************/
-
-static INLINE unsigned int masked_variance4xh_ssse3(
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride, int h, unsigned int *sse) {
- int ii;
-
- const __m128i v_zero = _mm_setzero_si128();
-
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
-
- assert((h % 2) == 0);
-
- for (ii = 0; ii < h / 2; ii++) {
- // Load 2 input rows - 8 bits
- const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a);
- const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b);
- const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
- const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride));
- const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride));
- const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
-
- // Interleave 2 rows into a single register
- const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b);
- const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b);
- const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
-
- // Unpack to 16 bits - still containing max 8 bits
- const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
- const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
- const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
- // Difference: [-255, 255]
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
- // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
- const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
- const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
- // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
- const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
-
- // Unpack Squared error to 64 bits
- const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
- const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
- // Accumulate
- v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
-
- // Move on to next 2 row
- a += a_stride * 2;
- b += b_stride * 2;
- m += m_stride * 2;
- }
-
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-#define MASKED_VAR4XH(H) \
- unsigned int aom_masked_variance4x##H##_ssse3( \
- const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
- sse); \
- }
-
-MASKED_VAR4XH(4)
-MASKED_VAR4XH(8)
-
-#if CONFIG_HIGHBITDEPTH
-
-// Main calculation for n*8 wide blocks
-static INLINE void highbd_masked_variance64_ssse3(
- const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
- const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) {
- int ii, jj;
-
- const __m128i v_zero = _mm_setzero_si128();
-
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
-
- assert((w % 8) == 0);
-
- for (ii = 0; ii < h; ii++) {
- for (jj = 0; jj < w; jj += 8) {
- // Load inputs - 8 bits
- const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj));
- const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj));
- const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj));
-
- // Unpack m to 16 bits - still containing max 8 bits
- const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
- // Difference: [-4095, 4095]
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
- // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
- const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
- // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
- const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
- const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
- const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
- const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
- const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
- const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
- const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
- // Square and sum the errors -> 36bits * 4 = 38bits
- __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
- v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
- v_elo1_d = _mm_srli_si128(v_elo_d, 4);
- v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
- v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
- v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
- v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
- v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
- v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
- v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
-
- // Accumulate
- v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
- }
-
- // Move on to next row
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
-
- // Horizontal sum
- *sum = hsum_epi32_si64(v_sum_d);
- *sse = hsum_epi64_si64(v_sse_q);
-
- // Round
- *sum = (*sum >= 0) ? *sum : -*sum;
- *sum = ROUND_POWER_OF_TWO(*sum, 6);
- *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-// Main calculation for 4 wide blocks
-static INLINE void highbd_masked_variance64_4wide_ssse3(
- const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
- const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) {
- int ii;
-
- const __m128i v_zero = _mm_setzero_si128();
-
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
-
- assert((h % 2) == 0);
-
- for (ii = 0; ii < h / 2; ii++) {
- // Load 2 input rows - 8 bits
- const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a);
- const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b);
- const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
- const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride));
- const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride));
- const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
-
- // Interleave 2 rows into a single register
- const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w);
- const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w);
- const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
-
- // Unpack to 16 bits - still containing max 8 bits
- const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
- // Difference: [-4095, 4095]
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
- // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incld sign bit)
- const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
- // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
- const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
- const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
- const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
- const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
- const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
- const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
- const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
- // Square and sum the errors -> 36bits * 4 = 38bits
- __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
- v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
- v_elo1_d = _mm_srli_si128(v_elo_d, 4);
- v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
- v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
- v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
- v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
- v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
- v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
- v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
-
- // Accumulate
- v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
- v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
-
- // Move on to next row
- a += a_stride * 2;
- b += b_stride * 2;
- m += m_stride * 2;
- }
-
- // Horizontal sum
- *sum = hsum_epi32_si32(v_sum_d);
- *sse = hsum_epi64_si64(v_sse_q);
-
- // Round
- *sum = (*sum >= 0) ? *sum : -*sum;
- *sum = ROUND_POWER_OF_TWO(*sum, 6);
- *sse = ROUND_POWER_OF_TWO(*sse, 12);
-}
-
-static INLINE unsigned int highbd_masked_variancewxh_ssse3(
- const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
- const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
- uint64_t sse64;
- int64_t sum64;
-
- if (w == 4)
- highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
- h, &sum64, &sse64);
- else
- highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
- &sum64, &sse64);
-
- // Store the SSE
- *sse = (uint32_t)sse64;
- // Compute and return variance
- return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
- const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
- const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
- uint64_t sse64;
- int64_t sum64;
-
- if (w == 4)
- highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
- h, &sum64, &sse64);
- else
- highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
- &sum64, &sse64);
-
- // Normalise
- sum64 = ROUND_POWER_OF_TWO(sum64, 2);
- sse64 = ROUND_POWER_OF_TWO(sse64, 4);
-
- // Store the SSE
- *sse = (uint32_t)sse64;
- // Compute and return variance
- return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
- const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
- const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
- uint64_t sse64;
- int64_t sum64;
-
- if (w == 4)
- highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
- h, &sum64, &sse64);
- else
- highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
- &sum64, &sse64);
-
- sum64 = ROUND_POWER_OF_TWO(sum64, 4);
- sse64 = ROUND_POWER_OF_TWO(sse64, 8);
-
- // Store the SSE
- *sse = (uint32_t)sse64;
- // Compute and return variance
- return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-#define HIGHBD_MASKED_VARWXH(W, H) \
- unsigned int aom_highbd_masked_variance##W##x##H##_ssse3( \
- const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
- uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
- return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
- m_stride, W, H, sse); \
- } \
- \
- unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3( \
- const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
- uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
- return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
- m_stride, W, H, sse); \
- } \
- \
- unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3( \
- const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
- const uint8_t *m, int m_stride, unsigned int *sse) { \
- uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
- uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
- return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
- m_stride, W, H, sse); \
- }
-
-HIGHBD_MASKED_VARWXH(4, 4)
-HIGHBD_MASKED_VARWXH(4, 8)
-HIGHBD_MASKED_VARWXH(8, 4)
-HIGHBD_MASKED_VARWXH(8, 8)
-HIGHBD_MASKED_VARWXH(8, 16)
-HIGHBD_MASKED_VARWXH(16, 8)
-HIGHBD_MASKED_VARWXH(16, 16)
-HIGHBD_MASKED_VARWXH(16, 32)
-HIGHBD_MASKED_VARWXH(32, 16)
-HIGHBD_MASKED_VARWXH(32, 32)
-HIGHBD_MASKED_VARWXH(32, 64)
-HIGHBD_MASKED_VARWXH(64, 32)
-HIGHBD_MASKED_VARWXH(64, 64)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKED_VARWXH(64, 128)
-HIGHBD_MASKED_VARWXH(128, 64)
-HIGHBD_MASKED_VARWXH(128, 128)
-#endif // CONFIG_EXT_PARTITION
-
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-// Sub pixel versions
-//////////////////////////////////////////////////////////////////////////////
-
-typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
- __m128i v_filter_b);
-
-static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
- const __m128i v_filter_b) {
- (void)v_filter_b;
- return _mm_avg_epu8(v_a_b, v_b_b);
-}
-
-static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b,
- const __m128i v_filter_b) {
- const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
- __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b);
- __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b);
- __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b);
- __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b);
- __m128i v_res_lo_w =
- _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
- __m128i v_res_hi_w =
- _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS);
- return _mm_packus_epi16(v_res_lo_w, v_res_hi_w);
-}
-
-// Apply the filter to the contents of the lower half of a and b
-static INLINE void apply_filter_lo(const __m128i v_a_lo_b,
- const __m128i v_b_lo_b,
- const __m128i v_filter_b, __m128i *v_res_w) {
- const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
- __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b);
- __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b);
- *v_res_w =
- _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
-}
-
-static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b,
- const __m128i v_m_b, __m128i *v_sum_d,
- __m128i *v_sse_q) {
- const __m128i v_zero = _mm_setzero_si128();
- // Unpack to 16 bits - still containing max 8 bits
- const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
- const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
- const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
- const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
- const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
- const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
-
- // Difference: [-255, 255]
- const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
- const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
-
- // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
- const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
- const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
- const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
- const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
-
- // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
- const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
- const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
-
- // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
- const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
-
- // Unpack Squared error to 64 bits
- const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
- const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
-
- // Accumulate
- *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d);
- *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d);
- *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q);
- *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
-}
-
-// Functions for width (W) >= 16
-unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride,
- int yoffset, const uint8_t *dst,
- int dst_stride, const uint8_t *msk,
- int msk_stride, unsigned int *sse,
- int w, int h,
- filter_fn_t filter_fn) {
- int i, j;
- __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- const __m128i v_filter_b = _mm_set1_epi16(
- (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- for (j = 0; j < w; j += 16) {
- // Load the first row ready
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
- // Process 2 rows at a time
- for (i = 0; i < h; i += 2) {
-      // Load the next row and apply the filter
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
- v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
- // Load the dst and msk for the variance calculation
- v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
- v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-
-      // Load the next row and apply the filter
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
- v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b);
- // Load the dst and msk for the variance calculation
- v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
- v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride));
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next block of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- // Reset to the top of the block
- src -= src_stride * h;
- dst -= dst_stride * h;
- msk -= msk_stride * h;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride,
- int xoffset, const uint8_t *dst,
- int dst_stride, const uint8_t *msk,
- int msk_stride, unsigned int *sse,
- int w, int h,
- filter_fn_t filter_fn) {
- int i, j;
- __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- const __m128i v_filter_b = _mm_set1_epi16(
- (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j += 16) {
-      // Load this row and one pixel to the right & apply the filter to them
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
- v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
-
- // Load the dst and msk for the variance calculation
- v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
- v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
- }
- src += src_stride;
- dst += dst_stride;
- msk += msk_stride;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
- unsigned int *sse, int w, int h, filter_fn_t xfilter_fn,
- filter_fn_t yfilter_fn) {
- int i, j;
- __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b;
- __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- const __m128i v_filterx_b = _mm_set1_epi16(
- (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
- const __m128i v_filtery_b = _mm_set1_epi16(
- (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- for (j = 0; j < w; j += 16) {
- // Load the first row ready
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
- v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
- // Process 2 rows at a time
- for (i = 0; i < h; i += 2) {
- // Load the next row & apply the filter
- v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
- v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
- v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b);
- // Load the dst and msk for the variance calculation
- v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
- v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
- // Complete the calculation for this row and add it to the running total
- v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b);
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-
- // Load the next row & apply the filter
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
- v_src1_b =
- _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
- v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
- // Load the dst and msk for the variance calculation
- v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
- v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j));
- // Complete the calculation for this row and add it to the running total
- v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b);
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next block of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- // Reset to the top of the block
- src -= src_stride * h;
- dst -= dst_stride * h;
- msk -= msk_stride * h;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
-}
-
-// Rows are loaded in this order: xmm[127:96] = row 1, xmm[95:64] = row 2,
-// xmm[63:32] = row 3, xmm[31:0] = row 4
-unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride,
- int yoffset, const uint8_t *dst,
- int dst_stride, const uint8_t *msk,
- int msk_stride, unsigned int *sse,
- int h) {
- int i;
- __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w;
- __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
- __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
- bilinear_filters_2t[yoffset][0]);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- // Load the first row of src data ready
- v_src0_b = _mm_loadl_epi64((const __m128i *)src);
- for (i = 0; i < h; i += 4) {
- // Load the rest of the source data for these rows
- v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
- v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
- v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
- v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
- v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
- // Load the dst data
- v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
- v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
- v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
- v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
- v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
- v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
- v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
- // Load the mask data
- v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
- v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
- v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
- v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
- v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
- v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
- v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
- // Apply the y filter
- if (yoffset == HALF_PIXEL_OFFSET) {
- v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
- v_src2_b =
- _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
- _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
- v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b);
- } else {
- v_src2_b =
- _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
- _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0)));
- apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w);
- v_src2_b =
- _mm_or_si128(_mm_slli_si128(v_src3_b, 4),
- _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
- apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w);
- v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w);
- }
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 4;
- dst += dst_stride * 4;
- msk += msk_stride * 4;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-// Rows are loaded in this order: xmm[127:64] = row 1, xmm[63:0] = row 2
-unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride,
- int yoffset, const uint8_t *dst,
- int dst_stride, const uint8_t *msk,
- int msk_stride, unsigned int *sse,
- int h) {
- int i;
- __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b;
- __m128i v_dst_b = _mm_setzero_si128();
- __m128i v_msk_b = _mm_setzero_si128();
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
- bilinear_filters_2t[yoffset][0]);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- // Load the first row of src data ready
- v_src0_b = _mm_loadl_epi64((const __m128i *)src);
- for (i = 0; i < h; i += 2) {
- if (yoffset == HALF_PIXEL_OFFSET) {
- // Load the rest of the source data for these rows
- v_src1_b = _mm_or_si128(
- _mm_slli_si128(v_src0_b, 8),
- _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
- v_src0_b = _mm_or_si128(
- _mm_slli_si128(v_src1_b, 8),
- _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
- // Apply the y filter
- v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b);
- } else {
- // Load the data and apply the y filter
- v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
- apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w);
- v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w);
- v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w);
- }
- // Load the dst data
- v_dst_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-// Rows are loaded in this order: xmm[127:96] = row 1, xmm[95:64] = row 2,
-// xmm[63:32] = row 3, xmm[31:0] = row 4
-unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride,
- int xoffset, const uint8_t *dst,
- int dst_stride, const uint8_t *msk,
- int msk_stride, unsigned int *sse,
- int h) {
- int i;
- __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
- __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
- __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
- __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
- bilinear_filters_2t[xoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- for (i = 0; i < h; i += 4) {
- // Load the src data
- v_src0_b = _mm_loadl_epi64((const __m128i *)src);
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
- v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
- v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
- v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
- v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
- v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
- v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
- v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
- v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
- // Load the dst data
- v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
- v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
- v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
- v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
- v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
- v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
- v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
- // Load the mask data
- v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
- v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
- v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
- v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
- v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
- v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
- v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
- v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
- v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
- apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w);
- v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
- }
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 4;
- dst += dst_stride * 4;
- msk += msk_stride * 4;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride,
- int xoffset, const uint8_t *dst,
- int dst_stride, const uint8_t *msk,
- int msk_stride, unsigned int *sse,
- int h) {
- int i;
- __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w;
- __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
- bilinear_filters_2t[xoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- for (i = 0; i < h; i += 2) {
- // Load the src data
- v_src0_b = _mm_loadu_si128((const __m128i *)(src));
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
- v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
- v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
- v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
- apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w);
- v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
- }
- // Load the dst data
- v_dst_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-// Rows are loaded in this order: xmm[127:96] = row 1, xmm[95:64] = row 2,
-// xmm[63:32] = row 3, xmm[31:0] = row 4
-unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
- unsigned int *sse, int h) {
- int i;
- __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
- __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
- __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b;
- __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b;
- __m128i v_xres_b[2];
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
- bilinear_filters_2t[xoffset][0]);
- __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
- bilinear_filters_2t[yoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- for (i = 0; i < h; i += 4) {
- // Load the src data
- v_src0_b = _mm_loadl_epi64((const __m128i *)src);
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
- v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
- v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
- v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
- v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
- v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
- v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
- v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
- v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
- v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
- v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
- apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w);
- v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
- }
- // Move onto the next set of rows
- src += src_stride * 4;
- }
- // Load one more row to be used in the y filter
- v_src0_b = _mm_loadl_epi64((const __m128i *)src);
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b),
- _mm_setr_epi32(-1, 0, 0, 0));
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
- v_extra_row_b =
- _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()),
- _mm_setr_epi32(-1, 0, 0, 0));
- }
-
- for (i = 0; i < h; i += 4) {
- if (h == 8 && i == 0) {
- v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4),
- _mm_srli_si128(v_xres_b[1], 12));
- } else {
- v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4),
- v_extra_row_b);
- }
- // Apply the y filter
- if (yoffset == HALF_PIXEL_OFFSET) {
- v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
- } else {
- v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
- }
-
- // Load the dst data
- v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
- v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
- v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
- v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
- v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
- v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
- v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
- // Load the mask data
- v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
- v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
- v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
- v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
- v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
- v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
- v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- dst += dst_stride * 4;
- msk += msk_stride * 4;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
- unsigned int *sse, int h) {
- int i;
- __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b;
- __m128i v_src0_shift_b, v_src1_shift_b;
- __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
- bilinear_filters_2t[xoffset][0]);
- __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
- bilinear_filters_2t[yoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- // Load the first block of src data
- v_src0_b = _mm_loadu_si128((const __m128i *)(src));
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
- v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
- v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
- v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
- apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
- v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
- }
- for (i = 0; i < h; i += 4) {
- // Load the next block of src data
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
- v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
- v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
- v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
- apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
- v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
- }
- // Apply the y filter to the previous block
- v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
- _mm_slli_si128(v_xres1_b, 8));
- if (yoffset == HALF_PIXEL_OFFSET) {
- v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
- } else {
- v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
- }
- // Load the dst data
- v_dst_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
-
- // Load the next block of src data
- v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
- v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
- v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
- v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
- v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
- v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
- } else {
- apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
- apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
- v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
- }
- // Apply the y filter to the previous block
- v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
- _mm_slli_si128(v_xres0_b, 8));
- if (yoffset == HALF_PIXEL_OFFSET) {
- v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
- } else {
- v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
- }
- // Load the dst data
- v_dst_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
- // Compute the sum and SSE
- sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 4;
- dst += dst_stride * 4;
- msk += msk_stride * 4;
- }
- return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
-}
-
-// For W >= 16
-#define MASK_SUBPIX_VAR_LARGE(W, H) \
- unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- assert(W % 16 == 0); \
- if (xoffset == 0) { \
- if (yoffset == 0) \
- return aom_masked_variance##W##x##H##_ssse3( \
- src, src_stride, dst, dst_stride, msk, msk_stride, sse); \
- else if (yoffset == HALF_PIXEL_OFFSET) \
- return aom_masked_subpel_varWxH_xzero( \
- src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
- msk_stride, sse, W, H, apply_filter_avg); \
- else \
- return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \
- dst_stride, msk, msk_stride, \
- sse, W, H, apply_filter); \
- } else if (yoffset == 0) { \
- if (xoffset == HALF_PIXEL_OFFSET) \
- return aom_masked_subpel_varWxH_yzero( \
- src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
- msk_stride, sse, W, H, apply_filter_avg); \
- else \
- return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \
- dst_stride, msk, msk_stride, \
- sse, W, H, apply_filter); \
- } else if (xoffset == HALF_PIXEL_OFFSET) { \
- if (yoffset == HALF_PIXEL_OFFSET) \
- return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \
- dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \
- apply_filter_avg); \
- else \
- return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
- msk_stride, sse, W, H, apply_filter_avg, apply_filter); \
- } else { \
- if (yoffset == HALF_PIXEL_OFFSET) \
- return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
- msk_stride, sse, W, H, apply_filter, apply_filter_avg); \
- else \
- return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \
- msk_stride, sse, W, H, apply_filter, apply_filter); \
- } \
- }
-
-// For W < 16
-#define MASK_SUBPIX_VAR_SMALL(W, H) \
- unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- assert(W == 4 || W == 8); \
- if (xoffset == 0 && yoffset == 0) \
- return aom_masked_variance##W##x##H##_ssse3( \
- src, src_stride, dst, dst_stride, msk, msk_stride, sse); \
- else if (xoffset == 0) \
- return aom_masked_subpel_var##W##xH_xzero( \
- src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \
- else if (yoffset == 0) \
- return aom_masked_subpel_var##W##xH_yzero( \
- src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \
- else \
- return aom_masked_subpel_var##W##xH_xnonzero_ynonzero( \
- src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
- sse, H); \
- }
-
-MASK_SUBPIX_VAR_SMALL(4, 4)
-MASK_SUBPIX_VAR_SMALL(4, 8)
-MASK_SUBPIX_VAR_SMALL(8, 4)
-MASK_SUBPIX_VAR_SMALL(8, 8)
-MASK_SUBPIX_VAR_SMALL(8, 16)
-MASK_SUBPIX_VAR_LARGE(16, 8)
-MASK_SUBPIX_VAR_LARGE(16, 16)
-MASK_SUBPIX_VAR_LARGE(16, 32)
-MASK_SUBPIX_VAR_LARGE(32, 16)
-MASK_SUBPIX_VAR_LARGE(32, 32)
-MASK_SUBPIX_VAR_LARGE(32, 64)
-MASK_SUBPIX_VAR_LARGE(64, 32)
-MASK_SUBPIX_VAR_LARGE(64, 64)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR_LARGE(64, 128)
-MASK_SUBPIX_VAR_LARGE(128, 64)
-MASK_SUBPIX_VAR_LARGE(128, 128)
-#endif // CONFIG_EXT_PARTITION
-
-#if CONFIG_HIGHBITDEPTH
-typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
- uint32_t *sse, int w, int h);
-typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride,
- unsigned int *sse);
-typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
- __m128i v_filter_w);
-
-static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
- const __m128i v_b_w,
- const __m128i v_filter_w) {
- (void)v_filter_w;
- return _mm_avg_epu16(v_a_w, v_b_w);
-}
-
-static INLINE __m128i highbd_apply_filter(const __m128i v_a_w,
- const __m128i v_b_w,
- const __m128i v_filter_w) {
- const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
- __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w);
- __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w);
- __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w);
- __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w);
- __m128i v_res_lo_d =
- _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
- __m128i v_res_hi_d =
- _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS);
- return _mm_packs_epi32(v_res_lo_d, v_res_hi_d);
-}
-// Apply the filter to the contents of the lower half of a and b
-static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w,
- const __m128i v_b_lo_w,
- const __m128i v_filter_w,
- __m128i *v_res_d) {
- const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
- __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w);
- __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w);
- *v_res_d =
- _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
-}
-
-static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w,
- const __m128i v_m_b, __m128i *v_sum_d,
- __m128i *v_sse_q) {
- const __m128i v_zero = _mm_setzero_si128();
- const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
-
-  // Difference: [-2^12, 2^12] => 13 bits (incl. sign bit)
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-
- // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits
- const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
-
- // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
- const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
- const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
- const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
- const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
- const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
- const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
- const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
-  // Square and sum the errors -> 36 bits * 4 = 38 bits
- __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
- v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
- v_elo1_d = _mm_srli_si128(v_elo_d, 4);
- v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
- v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
- v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
- v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
- v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
- v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
- v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
-
- // Accumulate
- *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
- *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
-}
-
-static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d,
- __m128i v_sse_q,
- uint32_t *sse, int w,
- int h) {
- int64_t sum64;
- uint64_t sse64;
-
- // Horizontal sum
- sum64 = hsum_epi32_si32(v_sum_d);
- sse64 = hsum_epi64_si64(v_sse_q);
-
- sum64 = (sum64 >= 0) ? sum64 : -sum64;
-
- // Round
- sum64 = ROUND_POWER_OF_TWO(sum64, 6);
- sse64 = ROUND_POWER_OF_TWO(sse64, 12);
-
- // Normalise
- sum64 = ROUND_POWER_OF_TWO(sum64, 2);
- sse64 = ROUND_POWER_OF_TWO(sse64, 4);
-
- // Store the SSE
- *sse = (uint32_t)sse64;
- // Compute the variance
- return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d,
- __m128i v_sse_q,
- uint32_t *sse, int w,
- int h) {
- int64_t sum64;
- uint64_t sse64;
-
- // Horizontal sum
- sum64 = hsum_epi32_si64(v_sum_d);
- sse64 = hsum_epi64_si64(v_sse_q);
-
- sum64 = (sum64 >= 0) ? sum64 : -sum64;
-
- // Round
- sum64 = ROUND_POWER_OF_TWO(sum64, 6);
- sse64 = ROUND_POWER_OF_TWO(sse64, 12);
-
- // Normalise
- sum64 = ROUND_POWER_OF_TWO(sum64, 4);
- sse64 = ROUND_POWER_OF_TWO(sse64, 8);
-
- // Store the SSE
- *sse = (uint32_t)sse64;
- // Compute the variance
- return *sse - (uint32_t)((sum64 * sum64) / (w * h));
-}
-
-// High bit depth functions for width (W) >= 8
-unsigned int aom_highbd_masked_subpel_varWxH_xzero(
- const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
- int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
- int w, int h, highbd_filter_fn_t filter_fn,
- highbd_calc_masked_var_t calc_var) {
- int i, j;
- __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- const __m128i v_filter_w =
- _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
- bilinear_filters_2t[yoffset][0]);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- for (j = 0; j < w; j += 8) {
- // Load the first row ready
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
- // Process 2 rows at a time
- for (i = 0; i < h; i += 2) {
-      // Load the next row and apply the filter
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
- v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
- // Load the dst and msk for the variance calculation
- v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
- v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-
-      // Load the next row and apply the filter
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
- v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w);
- // Load the dst and msk for the variance calculation
- v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
- v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride));
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next block of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- // Reset to the top of the block
- src -= src_stride * h;
- dst -= dst_stride * h;
- msk -= msk_stride * h;
- }
- return calc_var(v_sum_d, v_sse_q, sse, w, h);
-}
-unsigned int aom_highbd_masked_subpel_varWxH_yzero(
- const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
- int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
- int w, int h, highbd_filter_fn_t filter_fn,
- highbd_calc_masked_var_t calc_var) {
- int i, j;
- __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- const __m128i v_filter_w =
- _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
- bilinear_filters_2t[xoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j += 8) {
-      // Load this row and one pixel to the right & apply the filter to them
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
- v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
-
- // Load the dst and msk for the variance calculation
- v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
- v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
- }
- src += src_stride;
- dst += dst_stride;
- msk += msk_stride;
- }
- return calc_var(v_sum_d, v_sse_q, sse, w, h);
-}
-
-unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(
- const uint16_t *src, int src_stride, int xoffset, int yoffset,
- const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
- unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn,
- highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) {
- int i, j;
- __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w;
- __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- const __m128i v_filterx_w =
- _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
- bilinear_filters_2t[xoffset][0]);
- const __m128i v_filtery_w =
- _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
- bilinear_filters_2t[yoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- for (j = 0; j < w; j += 8) {
- // Load the first row ready
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
- v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
- // Process 2 rows at a time
- for (i = 0; i < h; i += 2) {
- // Load the next row & apply the filter
- v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
- v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
- v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w);
- // Load the dst and msk for the variance calculation
- v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
- v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
- // Complete the calculation for this row and add it to the running total
- v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w);
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-
- // Load the next row & apply the filter
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
- v_src1_w =
- _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
- v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
- // Load the dst and msk for the variance calculation
- v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
- v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j));
- // Complete the calculation for this row and add it to the running total
- v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w);
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next block of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- // Reset to the top of the block
- src -= src_stride * h;
- dst -= dst_stride * h;
- msk -= msk_stride * h;
- }
- return calc_var(v_sum_d, v_sse_q, sse, w, h);
-}
-
-// Rows are loaded in this order: xmm[127:64] = row 1, xmm[63:0] = row 2
-unsigned int aom_highbd_masked_subpel_var4xH_xzero(
- const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
- int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
- int h, highbd_calc_masked_var_t calc_var) {
- int i;
- __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w;
- __m128i v_dst_w, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
- bilinear_filters_2t[yoffset][0]);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- // Load the first row of src data ready
- v_src0_w = _mm_loadl_epi64((const __m128i *)src);
- for (i = 0; i < h; i += 2) {
- if (yoffset == HALF_PIXEL_OFFSET) {
- // Load the rest of the source data for these rows
- v_src1_w = _mm_or_si128(
- _mm_slli_si128(v_src0_w, 8),
- _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
- v_src0_w = _mm_or_si128(
- _mm_slli_si128(v_src1_w, 8),
- _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
- // Apply the y filter
- v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w);
- } else {
- // Load the data and apply the y filter
- v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
- highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d);
- v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d);
- v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d);
- }
- // Load the dst data
- v_dst_w = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi32(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
- // Compute the sum and SSE
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- return calc_var(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_highbd_masked_subpel_var4xH_yzero(
- const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
- int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
- int h, highbd_calc_masked_var_t calc_var) {
- int i;
- __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d;
- __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
- bilinear_filters_2t[xoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- for (i = 0; i < h; i += 2) {
- // Load the src data
- v_src0_w = _mm_loadu_si128((const __m128i *)(src));
- v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
- v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
- v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
- v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
- } else {
- highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w,
- &v_filtered0_d);
- highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w,
- &v_filtered1_d);
- v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
- }
- // Load the dst data
- v_dst_w = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi32(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
- // Compute the sum and SSE
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 2;
- dst += dst_stride * 2;
- msk += msk_stride * 2;
- }
- return calc_var(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero(
- const uint16_t *src, int src_stride, int xoffset, int yoffset,
- const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
- unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
- int i;
- __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b;
- __m128i v_src0_shift_w, v_src1_shift_w;
- __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_q = _mm_setzero_si128();
- __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
- bilinear_filters_2t[xoffset][0]);
- __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
- bilinear_filters_2t[yoffset][0]);
- assert(xoffset < BIL_SUBPEL_SHIFTS);
- assert(yoffset < BIL_SUBPEL_SHIFTS);
- // Load the first block of src data
- v_src0_w = _mm_loadu_si128((const __m128i *)(src));
- v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
- v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
- v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
- v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
- } else {
- highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
- &v_filtered0_d);
- highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
- &v_filtered1_d);
- v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
- }
- for (i = 0; i < h; i += 4) {
- // Load the next block of src data
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
- v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
- v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
- v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
- v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
- } else {
- highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
- &v_filtered0_d);
- highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
- &v_filtered1_d);
- v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
- }
- // Apply the y filter to the previous block
- v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
- _mm_slli_si128(v_xres1_w, 8));
- if (yoffset == HALF_PIXEL_OFFSET) {
- v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
- } else {
- v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
- }
- // Load the dst data
- v_dst_w = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi32(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
- // Compute the sum and SSE
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
-
- // Load the next block of src data
- v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
- v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
- v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
- v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
- // Apply the x filter
- if (xoffset == HALF_PIXEL_OFFSET) {
- v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
- v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
- v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
- } else {
- highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
- &v_filtered0_d);
- highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
- &v_filtered1_d);
- v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
- }
- // Apply the y filter to the previous block
- v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
- _mm_slli_si128(v_xres0_w, 8));
- if (yoffset == HALF_PIXEL_OFFSET) {
- v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
- } else {
- v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
- }
- // Load the dst data
- v_dst_w = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
- _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
- // Load the mask data
- v_msk_b = _mm_unpacklo_epi32(
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
- _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
- // Compute the sum and SSE
- highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
- // Move onto the next set of rows
- src += src_stride * 4;
- dst += dst_stride * 4;
- msk += msk_stride * 4;
- }
- return calc_var(v_sum_d, v_sse_q, sse, 4, h);
-}
-
-// For W >= 8
-#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \
- unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse, highbd_calc_masked_var_t calc_var, \
- highbd_variance_fn_t full_variance_function) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- assert(W % 8 == 0); \
- if (xoffset == 0) { \
- if (yoffset == 0) \
- return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
- msk_stride, sse); \
- else if (yoffset == HALF_PIXEL_OFFSET) \
- return aom_highbd_masked_subpel_varWxH_xzero( \
- src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
- msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \
- else \
- return aom_highbd_masked_subpel_varWxH_xzero( \
- src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \
- W, H, highbd_apply_filter, calc_var); \
- } else if (yoffset == 0) { \
- if (xoffset == HALF_PIXEL_OFFSET) \
- return aom_highbd_masked_subpel_varWxH_yzero( \
- src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
- msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \
- else \
- return aom_highbd_masked_subpel_varWxH_yzero( \
- src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \
- W, H, highbd_apply_filter, calc_var); \
- } else if (xoffset == HALF_PIXEL_OFFSET) { \
- if (yoffset == HALF_PIXEL_OFFSET) \
- return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \
- dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \
- highbd_apply_filter_avg, calc_var); \
- else \
- return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
- msk_stride, sse, W, H, highbd_apply_filter_avg, \
- highbd_apply_filter, calc_var); \
- } else { \
- if (yoffset == HALF_PIXEL_OFFSET) \
- return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
- msk_stride, sse, W, H, highbd_apply_filter, \
- highbd_apply_filter_avg, calc_var); \
- else \
- return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
- src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \
- msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter, \
- calc_var); \
- } \
- }
-
-// For W < 8
-#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \
- unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse, highbd_calc_masked_var_t calc_var, \
- highbd_variance_fn_t full_variance_function) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- assert(W == 4); \
- if (xoffset == 0 && yoffset == 0) \
- return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
- msk_stride, sse); \
- else if (xoffset == 0) \
- return aom_highbd_masked_subpel_var4xH_xzero( \
- src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \
- calc_var); \
- else if (yoffset == 0) \
- return aom_highbd_masked_subpel_var4xH_yzero( \
- src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \
- calc_var); \
- else \
- return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \
- src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
- sse, H, calc_var); \
- }
-
-#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \
- unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
- src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
- sse, calc_masked_variance, \
- aom_highbd_masked_variance##W##x##H##_ssse3); \
- } \
- unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
- src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
- sse, highbd_10_calc_masked_variance, \
- aom_highbd_10_masked_variance##W##x##H##_ssse3); \
- } \
- unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
- unsigned int *sse) { \
- return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
- src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
- sse, highbd_12_calc_masked_variance, \
- aom_highbd_12_masked_variance##W##x##H##_ssse3); \
- }
-
-HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4)
-HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
-HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
-#endif // CONFIG_EXT_PARTITION
-#endif
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index cfa0557..5b2dcef 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1171,33 +1171,10 @@
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
#if CONFIG_EXT_INTER
-#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF, MCSDF, MCVF, MCSVF) \
- cpi->fn_ptr[BT].msdf = MSDF; \
- cpi->fn_ptr[BT].mvf = MVF; \
- cpi->fn_ptr[BT].msvf = MSVF; \
- cpi->fn_ptr[BT].mcsdf = MCSDF; \
- cpi->fn_ptr[BT].mcvf = MCVF; \
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].mcsdf = MCSDF; \
cpi->fn_ptr[BT].mcsvf = MCSVF;
-#define MAKE_MBFP_SAD_WRAPPER(fnname) \
- static unsigned int fnname##_bits8( \
- const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
- int ref_stride, const uint8_t *m, int m_stride) { \
- return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride); \
- } \
- static unsigned int fnname##_bits10( \
- const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
- int ref_stride, const uint8_t *m, int m_stride) { \
- return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
- 2; \
- } \
- static unsigned int fnname##_bits12( \
- const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
- int ref_stride, const uint8_t *m, int m_stride) { \
- return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
- 4; \
- }
-
#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \
static unsigned int fnname##_bits8( \
const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
@@ -1224,26 +1201,10 @@
}
#if CONFIG_EXT_PARTITION
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128)
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad128x128)
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad128x64)
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad64x128)
#endif // CONFIG_EXT_PARTITION
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8)
-MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4)
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad64x64)
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad64x32)
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_compound_sad32x64)
@@ -1421,102 +1382,38 @@
#if CONFIG_EXT_INTER
#if CONFIG_EXT_PARTITION
- HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
- aom_highbd_masked_variance128x128,
- aom_highbd_masked_sub_pixel_variance128x128,
- aom_highbd_masked_compound_sad128x128_bits8,
- aom_highbd_masked_compound_variance128x128,
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_compound_sad128x128_bits8,
aom_highbd_masked_compound_sub_pixel_variance128x128)
- HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
- aom_highbd_masked_variance128x64,
- aom_highbd_masked_sub_pixel_variance128x64,
- aom_highbd_masked_compound_sad128x64_bits8,
- aom_highbd_masked_compound_variance128x64,
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_compound_sad128x64_bits8,
aom_highbd_masked_compound_sub_pixel_variance128x64)
- HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
- aom_highbd_masked_variance64x128,
- aom_highbd_masked_sub_pixel_variance64x128,
- aom_highbd_masked_compound_sad64x128_bits8,
- aom_highbd_masked_compound_variance64x128,
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_compound_sad64x128_bits8,
aom_highbd_masked_compound_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
- HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
- aom_highbd_masked_variance64x64,
- aom_highbd_masked_sub_pixel_variance64x64,
- aom_highbd_masked_compound_sad64x64_bits8,
- aom_highbd_masked_compound_variance64x64,
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_compound_sad64x64_bits8,
aom_highbd_masked_compound_sub_pixel_variance64x64)
- HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
- aom_highbd_masked_variance64x32,
- aom_highbd_masked_sub_pixel_variance64x32,
- aom_highbd_masked_compound_sad64x32_bits8,
- aom_highbd_masked_compound_variance64x32,
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_compound_sad64x32_bits8,
aom_highbd_masked_compound_sub_pixel_variance64x32)
- HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
- aom_highbd_masked_variance32x64,
- aom_highbd_masked_sub_pixel_variance32x64,
- aom_highbd_masked_compound_sad32x64_bits8,
- aom_highbd_masked_compound_variance32x64,
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_compound_sad32x64_bits8,
aom_highbd_masked_compound_sub_pixel_variance32x64)
- HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
- aom_highbd_masked_variance32x32,
- aom_highbd_masked_sub_pixel_variance32x32,
- aom_highbd_masked_compound_sad32x32_bits8,
- aom_highbd_masked_compound_variance32x32,
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_compound_sad32x32_bits8,
aom_highbd_masked_compound_sub_pixel_variance32x32)
- HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
- aom_highbd_masked_variance32x16,
- aom_highbd_masked_sub_pixel_variance32x16,
- aom_highbd_masked_compound_sad32x16_bits8,
- aom_highbd_masked_compound_variance32x16,
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_compound_sad32x16_bits8,
aom_highbd_masked_compound_sub_pixel_variance32x16)
- HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
- aom_highbd_masked_variance16x32,
- aom_highbd_masked_sub_pixel_variance16x32,
- aom_highbd_masked_compound_sad16x32_bits8,
- aom_highbd_masked_compound_variance16x32,
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_compound_sad16x32_bits8,
aom_highbd_masked_compound_sub_pixel_variance16x32)
- HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
- aom_highbd_masked_variance16x16,
- aom_highbd_masked_sub_pixel_variance16x16,
- aom_highbd_masked_compound_sad16x16_bits8,
- aom_highbd_masked_compound_variance16x16,
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_compound_sad16x16_bits8,
aom_highbd_masked_compound_sub_pixel_variance16x16)
- HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
- aom_highbd_masked_variance8x16,
- aom_highbd_masked_sub_pixel_variance8x16,
- aom_highbd_masked_compound_sad8x16_bits8,
- aom_highbd_masked_compound_variance8x16,
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_compound_sad8x16_bits8,
aom_highbd_masked_compound_sub_pixel_variance8x16)
- HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
- aom_highbd_masked_variance16x8,
- aom_highbd_masked_sub_pixel_variance16x8,
- aom_highbd_masked_compound_sad16x8_bits8,
- aom_highbd_masked_compound_variance16x8,
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_compound_sad16x8_bits8,
aom_highbd_masked_compound_sub_pixel_variance16x8)
- HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
- aom_highbd_masked_variance8x8,
- aom_highbd_masked_sub_pixel_variance8x8,
- aom_highbd_masked_compound_sad8x8_bits8,
- aom_highbd_masked_compound_variance8x8,
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_compound_sad8x8_bits8,
aom_highbd_masked_compound_sub_pixel_variance8x8)
- HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
- aom_highbd_masked_variance4x8,
- aom_highbd_masked_sub_pixel_variance4x8,
- aom_highbd_masked_compound_sad4x8_bits8,
- aom_highbd_masked_compound_variance4x8,
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_compound_sad4x8_bits8,
aom_highbd_masked_compound_sub_pixel_variance4x8)
- HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
- aom_highbd_masked_variance8x4,
- aom_highbd_masked_sub_pixel_variance8x4,
- aom_highbd_masked_compound_sad8x4_bits8,
- aom_highbd_masked_compound_variance8x4,
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_compound_sad8x4_bits8,
aom_highbd_masked_compound_sub_pixel_variance8x4)
- HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
- aom_highbd_masked_variance4x4,
- aom_highbd_masked_sub_pixel_variance4x4,
- aom_highbd_masked_compound_sad4x4_bits8,
- aom_highbd_masked_compound_variance4x4,
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_compound_sad4x4_bits8,
aom_highbd_masked_compound_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
@@ -1692,102 +1589,38 @@
#if CONFIG_EXT_INTER
#if CONFIG_EXT_PARTITION
- HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
- aom_highbd_10_masked_variance128x128,
- aom_highbd_10_masked_sub_pixel_variance128x128,
- aom_highbd_masked_compound_sad128x128_bits10,
- aom_highbd_10_masked_compound_variance128x128,
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_compound_sad128x128_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance128x128)
- HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
- aom_highbd_10_masked_variance128x64,
- aom_highbd_10_masked_sub_pixel_variance128x64,
- aom_highbd_masked_compound_sad128x64_bits10,
- aom_highbd_10_masked_compound_variance128x64,
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_compound_sad128x64_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance128x64)
- HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
- aom_highbd_10_masked_variance64x128,
- aom_highbd_10_masked_sub_pixel_variance64x128,
- aom_highbd_masked_compound_sad64x128_bits10,
- aom_highbd_10_masked_compound_variance64x128,
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_compound_sad64x128_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
- HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
- aom_highbd_10_masked_variance64x64,
- aom_highbd_10_masked_sub_pixel_variance64x64,
- aom_highbd_masked_compound_sad64x64_bits10,
- aom_highbd_10_masked_compound_variance64x64,
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_compound_sad64x64_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance64x64)
- HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
- aom_highbd_10_masked_variance64x32,
- aom_highbd_10_masked_sub_pixel_variance64x32,
- aom_highbd_masked_compound_sad64x32_bits10,
- aom_highbd_10_masked_compound_variance64x32,
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_compound_sad64x32_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance64x32)
- HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
- aom_highbd_10_masked_variance32x64,
- aom_highbd_10_masked_sub_pixel_variance32x64,
- aom_highbd_masked_compound_sad32x64_bits10,
- aom_highbd_10_masked_compound_variance32x64,
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_compound_sad32x64_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance32x64)
- HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
- aom_highbd_10_masked_variance32x32,
- aom_highbd_10_masked_sub_pixel_variance32x32,
- aom_highbd_masked_compound_sad32x32_bits10,
- aom_highbd_10_masked_compound_variance32x32,
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_compound_sad32x32_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance32x32)
- HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
- aom_highbd_10_masked_variance32x16,
- aom_highbd_10_masked_sub_pixel_variance32x16,
- aom_highbd_masked_compound_sad32x16_bits10,
- aom_highbd_10_masked_compound_variance32x16,
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_compound_sad32x16_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance32x16)
- HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
- aom_highbd_10_masked_variance16x32,
- aom_highbd_10_masked_sub_pixel_variance16x32,
- aom_highbd_masked_compound_sad16x32_bits10,
- aom_highbd_10_masked_compound_variance16x32,
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_compound_sad16x32_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance16x32)
- HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
- aom_highbd_10_masked_variance16x16,
- aom_highbd_10_masked_sub_pixel_variance16x16,
- aom_highbd_masked_compound_sad16x16_bits10,
- aom_highbd_10_masked_compound_variance16x16,
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_compound_sad16x16_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance16x16)
- HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
- aom_highbd_10_masked_variance8x16,
- aom_highbd_10_masked_sub_pixel_variance8x16,
- aom_highbd_masked_compound_sad8x16_bits10,
- aom_highbd_10_masked_compound_variance8x16,
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_compound_sad8x16_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance8x16)
- HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
- aom_highbd_10_masked_variance16x8,
- aom_highbd_10_masked_sub_pixel_variance16x8,
- aom_highbd_masked_compound_sad16x8_bits10,
- aom_highbd_10_masked_compound_variance16x8,
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_compound_sad16x8_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance16x8)
- HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
- aom_highbd_10_masked_variance8x8,
- aom_highbd_10_masked_sub_pixel_variance8x8,
- aom_highbd_masked_compound_sad8x8_bits10,
- aom_highbd_10_masked_compound_variance8x8,
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_compound_sad8x8_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance8x8)
- HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
- aom_highbd_10_masked_variance4x8,
- aom_highbd_10_masked_sub_pixel_variance4x8,
- aom_highbd_masked_compound_sad4x8_bits10,
- aom_highbd_10_masked_compound_variance4x8,
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_compound_sad4x8_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance4x8)
- HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
- aom_highbd_10_masked_variance8x4,
- aom_highbd_10_masked_sub_pixel_variance8x4,
- aom_highbd_masked_compound_sad8x4_bits10,
- aom_highbd_10_masked_compound_variance8x4,
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_compound_sad8x4_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance8x4)
- HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
- aom_highbd_10_masked_variance4x4,
- aom_highbd_10_masked_sub_pixel_variance4x4,
- aom_highbd_masked_compound_sad4x4_bits10,
- aom_highbd_10_masked_compound_variance4x4,
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_compound_sad4x4_bits10,
aom_highbd_10_masked_compound_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
@@ -1963,102 +1796,38 @@
#if CONFIG_EXT_INTER
#if CONFIG_EXT_PARTITION
- HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
- aom_highbd_12_masked_variance128x128,
- aom_highbd_12_masked_sub_pixel_variance128x128,
- aom_highbd_masked_compound_sad128x128_bits12,
- aom_highbd_12_masked_compound_variance128x128,
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_compound_sad128x128_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance128x128)
- HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
- aom_highbd_12_masked_variance128x64,
- aom_highbd_12_masked_sub_pixel_variance128x64,
- aom_highbd_masked_compound_sad128x64_bits12,
- aom_highbd_12_masked_compound_variance128x64,
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_compound_sad128x64_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance128x64)
- HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
- aom_highbd_12_masked_variance64x128,
- aom_highbd_12_masked_sub_pixel_variance64x128,
- aom_highbd_masked_compound_sad64x128_bits12,
- aom_highbd_12_masked_compound_variance64x128,
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_compound_sad64x128_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
- HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
- aom_highbd_12_masked_variance64x64,
- aom_highbd_12_masked_sub_pixel_variance64x64,
- aom_highbd_masked_compound_sad64x64_bits12,
- aom_highbd_12_masked_compound_variance64x64,
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_compound_sad64x64_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance64x64)
- HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
- aom_highbd_12_masked_variance64x32,
- aom_highbd_12_masked_sub_pixel_variance64x32,
- aom_highbd_masked_compound_sad64x32_bits12,
- aom_highbd_12_masked_compound_variance64x32,
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_compound_sad64x32_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance64x32)
- HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
- aom_highbd_12_masked_variance32x64,
- aom_highbd_12_masked_sub_pixel_variance32x64,
- aom_highbd_masked_compound_sad32x64_bits12,
- aom_highbd_12_masked_compound_variance32x64,
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_compound_sad32x64_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance32x64)
- HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
- aom_highbd_12_masked_variance32x32,
- aom_highbd_12_masked_sub_pixel_variance32x32,
- aom_highbd_masked_compound_sad32x32_bits12,
- aom_highbd_12_masked_compound_variance32x32,
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_compound_sad32x32_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance32x32)
- HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
- aom_highbd_12_masked_variance32x16,
- aom_highbd_12_masked_sub_pixel_variance32x16,
- aom_highbd_masked_compound_sad32x16_bits12,
- aom_highbd_12_masked_compound_variance32x16,
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_compound_sad32x16_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance32x16)
- HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
- aom_highbd_12_masked_variance16x32,
- aom_highbd_12_masked_sub_pixel_variance16x32,
- aom_highbd_masked_compound_sad16x32_bits12,
- aom_highbd_12_masked_compound_variance16x32,
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_compound_sad16x32_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance16x32)
- HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
- aom_highbd_12_masked_variance16x16,
- aom_highbd_12_masked_sub_pixel_variance16x16,
- aom_highbd_masked_compound_sad16x16_bits12,
- aom_highbd_12_masked_compound_variance16x16,
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_compound_sad16x16_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance16x16)
- HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
- aom_highbd_12_masked_variance8x16,
- aom_highbd_12_masked_sub_pixel_variance8x16,
- aom_highbd_masked_compound_sad8x16_bits12,
- aom_highbd_12_masked_compound_variance8x16,
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_compound_sad8x16_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance8x16)
- HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
- aom_highbd_12_masked_variance16x8,
- aom_highbd_12_masked_sub_pixel_variance16x8,
- aom_highbd_masked_compound_sad16x8_bits12,
- aom_highbd_12_masked_compound_variance16x8,
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_compound_sad16x8_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance16x8)
- HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
- aom_highbd_12_masked_variance8x8,
- aom_highbd_12_masked_sub_pixel_variance8x8,
- aom_highbd_masked_compound_sad8x8_bits12,
- aom_highbd_12_masked_compound_variance8x8,
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_compound_sad8x8_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance8x8)
- HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
- aom_highbd_12_masked_variance4x8,
- aom_highbd_12_masked_sub_pixel_variance4x8,
- aom_highbd_masked_compound_sad4x8_bits12,
- aom_highbd_12_masked_compound_variance4x8,
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_compound_sad4x8_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance4x8)
- HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
- aom_highbd_12_masked_variance8x4,
- aom_highbd_12_masked_sub_pixel_variance8x4,
- aom_highbd_masked_compound_sad8x4_bits12,
- aom_highbd_12_masked_compound_variance8x4,
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_compound_sad8x4_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance8x4)
- HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
- aom_highbd_12_masked_variance4x4,
- aom_highbd_12_masked_sub_pixel_variance4x4,
- aom_highbd_masked_compound_sad4x4_bits12,
- aom_highbd_12_masked_compound_variance4x4,
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_compound_sad4x4_bits12,
aom_highbd_12_masked_compound_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
@@ -2642,79 +2411,43 @@
#endif // CONFIG_MOTION_VAR
#if CONFIG_EXT_INTER
-#define MBFP(BT, MSDF, MVF, MSVF, MCSDF, MCVF, MCSVF) \
- cpi->fn_ptr[BT].msdf = MSDF; \
- cpi->fn_ptr[BT].mvf = MVF; \
- cpi->fn_ptr[BT].msvf = MSVF; \
- cpi->fn_ptr[BT].mcsdf = MCSDF; \
- cpi->fn_ptr[BT].mcvf = MCVF; \
+#define MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].mcsdf = MCSDF; \
cpi->fn_ptr[BT].mcsvf = MCSVF;
#if CONFIG_EXT_PARTITION
- MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128,
- aom_masked_sub_pixel_variance128x128, aom_masked_compound_sad128x128,
- aom_masked_compound_variance128x128,
+ MBFP(BLOCK_128X128, aom_masked_compound_sad128x128,
aom_masked_compound_sub_pixel_variance128x128)
- MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64,
- aom_masked_sub_pixel_variance128x64, aom_masked_compound_sad128x64,
- aom_masked_compound_variance128x64,
+ MBFP(BLOCK_128X64, aom_masked_compound_sad128x64,
aom_masked_compound_sub_pixel_variance128x64)
- MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128,
- aom_masked_sub_pixel_variance64x128, aom_masked_compound_sad64x128,
- aom_masked_compound_variance64x128,
+ MBFP(BLOCK_64X128, aom_masked_compound_sad64x128,
aom_masked_compound_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
- MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64,
- aom_masked_sub_pixel_variance64x64, aom_masked_compound_sad64x64,
- aom_masked_compound_variance64x64,
+ MBFP(BLOCK_64X64, aom_masked_compound_sad64x64,
aom_masked_compound_sub_pixel_variance64x64)
- MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32,
- aom_masked_sub_pixel_variance64x32, aom_masked_compound_sad64x32,
- aom_masked_compound_variance64x32,
+ MBFP(BLOCK_64X32, aom_masked_compound_sad64x32,
aom_masked_compound_sub_pixel_variance64x32)
- MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64,
- aom_masked_sub_pixel_variance32x64, aom_masked_compound_sad32x64,
- aom_masked_compound_variance32x64,
+ MBFP(BLOCK_32X64, aom_masked_compound_sad32x64,
aom_masked_compound_sub_pixel_variance32x64)
- MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32,
- aom_masked_sub_pixel_variance32x32, aom_masked_compound_sad32x32,
- aom_masked_compound_variance32x32,
+ MBFP(BLOCK_32X32, aom_masked_compound_sad32x32,
aom_masked_compound_sub_pixel_variance32x32)
- MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16,
- aom_masked_sub_pixel_variance32x16, aom_masked_compound_sad32x16,
- aom_masked_compound_variance32x16,
+ MBFP(BLOCK_32X16, aom_masked_compound_sad32x16,
aom_masked_compound_sub_pixel_variance32x16)
- MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32,
- aom_masked_sub_pixel_variance16x32, aom_masked_compound_sad16x32,
- aom_masked_compound_variance16x32,
+ MBFP(BLOCK_16X32, aom_masked_compound_sad16x32,
aom_masked_compound_sub_pixel_variance16x32)
- MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16,
- aom_masked_sub_pixel_variance16x16, aom_masked_compound_sad16x16,
- aom_masked_compound_variance16x16,
+ MBFP(BLOCK_16X16, aom_masked_compound_sad16x16,
aom_masked_compound_sub_pixel_variance16x16)
- MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8,
- aom_masked_sub_pixel_variance16x8, aom_masked_compound_sad16x8,
- aom_masked_compound_variance16x8,
+ MBFP(BLOCK_16X8, aom_masked_compound_sad16x8,
aom_masked_compound_sub_pixel_variance16x8)
- MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_variance8x16,
- aom_masked_sub_pixel_variance8x16, aom_masked_compound_sad8x16,
- aom_masked_compound_variance8x16,
+ MBFP(BLOCK_8X16, aom_masked_compound_sad8x16,
aom_masked_compound_sub_pixel_variance8x16)
- MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8,
- aom_masked_sub_pixel_variance8x8, aom_masked_compound_sad8x8,
- aom_masked_compound_variance8x8,
+ MBFP(BLOCK_8X8, aom_masked_compound_sad8x8,
aom_masked_compound_sub_pixel_variance8x8)
- MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8,
- aom_masked_sub_pixel_variance4x8, aom_masked_compound_sad4x8,
- aom_masked_compound_variance4x8,
+ MBFP(BLOCK_4X8, aom_masked_compound_sad4x8,
aom_masked_compound_sub_pixel_variance4x8)
- MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4,
- aom_masked_sub_pixel_variance8x4, aom_masked_compound_sad8x4,
- aom_masked_compound_variance8x4,
+ MBFP(BLOCK_8X4, aom_masked_compound_sad8x4,
aom_masked_compound_sub_pixel_variance8x4)
- MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4,
- aom_masked_sub_pixel_variance4x4, aom_masked_compound_sad4x4,
- aom_masked_compound_variance4x4,
+ MBFP(BLOCK_4X4, aom_masked_compound_sad4x4,
aom_masked_compound_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
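Note on the retained wrappers: the MAKE_MBFP_COMPOUND_SAD_WRAPPER variants wired up above presumably mirror the deleted MAKE_MBFP_SAD_WRAPPER, rescaling the SAD back into 8-bit-equivalent units (>> 2 for 10-bit, >> 4 for 12-bit input) so that rate-distortion costs stay comparable across bit depths. A minimal sketch of that pattern, using a hypothetical 16x16 wrapper purely for illustration (the real wrappers are generated by the macro):

/* Hypothetical illustration, not part of the patch. Assumes the rtcd
 * dispatcher aom_highbd_masked_compound_sad16x16 declared in
 * aom_dsp_rtcd_defs.pl is available. */
static unsigned int example_masked_compound_sad16x16_bits10(
    const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,
    int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
    int msk_stride, int invert_mask) {
  /* A 10-bit SAD can be up to 4x an 8-bit one, so shift right by 2. */
  return aom_highbd_masked_compound_sad16x16(src_ptr, source_stride, ref_ptr,
                                             ref_stride, second_pred, msk,
                                             msk_stride, invert_mask) >>
         2;
}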
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cbdfc8f..b727739 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2524,598 +2524,6 @@
return var;
}
-#if CONFIG_EXT_INTER
-/* returns subpixel variance error function */
-#define DIST(r, c) \
- vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \
- mask, mask_stride, &sse)
-
-/* checks if (r, c) has better score than previous best */
-
-#define MVC(r, c) \
- (unsigned int)(mvcost \
- ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
- mvcost[0][((r)-rr)] + (int64_t)mvcost[1][((c)-rc)]) * \
- error_per_bit + \
- 4096) >> \
- 13 \
- : 0)
-
-#define CHECK_BETTER(v, r, c) \
- if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
- thismse = (DIST(r, c)); \
- if ((v = MVC(r, c) + thismse) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- } else { \
- v = INT_MAX; \
- }
-
-#undef CHECK_BETTER0
-#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
-
-#undef CHECK_BETTER1
-#define CHECK_BETTER1(v, r, c) \
- if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
- thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z, \
- src_stride, upre(y, y_stride, r, c), \
- y_stride, w, h, &sse); \
- if ((v = MVC(r, c) + thismse) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- } else { \
- v = INT_MAX; \
- }
-
-int av1_find_best_masked_sub_pixel_tree(
- const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
- const MV *ref_mv, int allow_hp, int error_per_bit,
- const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
- int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
- int is_second) {
- const uint8_t *const z = x->plane[0].src.buf;
- const int src_stride = x->plane[0].src.stride;
- const MACROBLOCKD *xd = &x->e_mbd;
- unsigned int besterr = INT_MAX;
- unsigned int sse;
- int thismse;
- unsigned int whichdir;
- unsigned int halfiters = iters_per_step;
- unsigned int quarteriters = iters_per_step;
- unsigned int eighthiters = iters_per_step;
-
- const int y_stride = xd->plane[0].pre[is_second].stride;
- const int offset = bestmv->row * y_stride + bestmv->col;
- const uint8_t *const y = xd->plane[0].pre[is_second].buf;
-
- int rr = ref_mv->row;
- int rc = ref_mv->col;
- int br = bestmv->row * 8;
- int bc = bestmv->col * 8;
- int hstep = 4;
- int tr = br;
- int tc = bc;
- int minc, maxc, minr, maxr;
-
- av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
- ref_mv);
-
- // central mv
- bestmv->row *= 8;
- bestmv->col *= 8;
-
- // calculate central point error
- besterr =
- vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1);
- *distortion = besterr;
- besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
- // 1/2 pel
- FIRST_LEVEL_CHECKS;
- if (halfiters > 1) {
- SECOND_LEVEL_CHECKS;
- }
- tr = br;
- tc = bc;
-
- // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
- if (forced_stop != 2) {
- hstep >>= 1;
- FIRST_LEVEL_CHECKS;
- if (quarteriters > 1) {
- SECOND_LEVEL_CHECKS;
- }
- tr = br;
- tc = bc;
- }
-
- if (allow_hp && forced_stop == 0) {
- hstep >>= 1;
- FIRST_LEVEL_CHECKS;
- if (eighthiters > 1) {
- SECOND_LEVEL_CHECKS;
- }
- tr = br;
- tc = bc;
- }
- // These lines insure static analysis doesn't warn that
- // tr and tc aren't used after the above point.
- (void)tr;
- (void)tc;
-
- bestmv->row = br;
- bestmv->col = bc;
-
- return besterr;
-}
-
-static unsigned int setup_masked_center_error(
- const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp,
- const uint8_t *const src, const int src_stride, const uint8_t *const y,
- int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
- int *distortion) {
- unsigned int besterr;
- besterr =
- vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1);
- *distortion = besterr;
- besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- return besterr;
-}
-
-static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
- const uint8_t *mask, int mask_stride,
- const aom_variance_fn_ptr_t *vfp,
- const uint8_t *const src,
- const int src_stride,
- const uint8_t *const y, int y_stride,
- int w, int h, unsigned int *sse) {
- unsigned int besterr;
-#if CONFIG_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
- aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
-
- besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask,
- mask_stride, sse);
- } else {
- DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-#else
- DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
- (void)xd;
-#endif // CONFIG_HIGHBITDEPTH
- aom_upsampled_pred(pred, w, h, y, y_stride);
-
- besterr = vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse);
-#if CONFIG_HIGHBITDEPTH
- }
-#endif
- return besterr;
-}
-
-static unsigned int upsampled_setup_masked_center_error(
- const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride,
- const MV *bestmv, const MV *ref_mv, int error_per_bit,
- const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
- const int src_stride, const uint8_t *const y, int y_stride, int w, int h,
- int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
- int *distortion) {
- unsigned int besterr =
- upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride,
- y + offset, y_stride, w, h, sse1);
- *distortion = besterr;
- besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- return besterr;
-}
-
-int av1_find_best_masked_sub_pixel_tree_up(
- const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride,
- int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
- int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
- unsigned int *sse1, int is_second, int use_upsampled_ref) {
- const uint8_t *const z = x->plane[0].src.buf;
- const uint8_t *const src_address = z;
- const int src_stride = x->plane[0].src.stride;
- MACROBLOCKD *xd = &x->e_mbd;
- struct macroblockd_plane *const pd = &xd->plane[0];
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- unsigned int besterr = INT_MAX;
- unsigned int sse;
- unsigned int thismse;
-
- int rr = ref_mv->row;
- int rc = ref_mv->col;
- int br = bestmv->row * 8;
- int bc = bestmv->col * 8;
- int hstep = 4;
- int iter;
- int round = 3 - forced_stop;
- int tr = br;
- int tc = bc;
- const MV *search_step = search_step_table;
- int idx, best_idx = -1;
- unsigned int cost_array[5];
- int kr, kc;
- const int w = block_size_wide[mbmi->sb_type];
- const int h = block_size_high[mbmi->sb_type];
- int offset;
- int y_stride;
- const uint8_t *y;
-
- const struct buf_2d backup_pred = pd->pre[is_second];
- int minc, maxc, minr, maxr;
-
- av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
- ref_mv);
-
- if (use_upsampled_ref) {
- int ref = xd->mi[0]->mbmi.ref_frame[is_second];
- const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
- setup_pred_plane(&pd->pre[is_second], mbmi->sb_type,
- upsampled_ref->y_buffer, upsampled_ref->y_crop_width,
- upsampled_ref->y_crop_height, upsampled_ref->y_stride,
- (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
- pd->subsampling_y);
- }
- y = pd->pre[is_second].buf;
- y_stride = pd->pre[is_second].stride;
- offset = bestmv->row * y_stride + bestmv->col;
-
- if (!allow_hp)
- if (round == 3) round = 2;
-
- bestmv->row *= 8;
- bestmv->col *= 8;
-
- // use_upsampled_ref can be 0 or 1
- if (use_upsampled_ref)
- besterr = upsampled_setup_masked_center_error(
- xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z,
- src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1,
- distortion);
- else
- besterr = setup_masked_center_error(
- mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y,
- y_stride, offset, mvjcost, mvcost, sse1, distortion);
-
- for (iter = 0; iter < round; ++iter) {
- // Check vertical and horizontal sub-pixel positions.
- for (idx = 0; idx < 4; ++idx) {
- tr = br + search_step[idx].row;
- tc = bc + search_step[idx].col;
- if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
- MV this_mv = { tr, tc };
-
- if (use_upsampled_ref) {
- const uint8_t *const pre_address = y + tr * y_stride + tc;
-
- thismse = upsampled_masked_pref_error(
- xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
- y_stride, w, h, &sse);
- } else {
- const uint8_t *const pre_address =
- y + (tr >> 3) * y_stride + (tc >> 3);
- thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
- src_address, src_stride, mask, mask_stride, &sse);
- }
-
- cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
- mvcost, error_per_bit);
-
- if (cost_array[idx] < besterr) {
- best_idx = idx;
- besterr = cost_array[idx];
- *distortion = thismse;
- *sse1 = sse;
- }
- } else {
- cost_array[idx] = INT_MAX;
- }
- }
-
- // Check diagonal sub-pixel position
- kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
- kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
-
- tc = bc + kc;
- tr = br + kr;
- if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
- MV this_mv = { tr, tc };
-
- if (use_upsampled_ref) {
- const uint8_t *const pre_address = y + tr * y_stride + tc;
-
- thismse = upsampled_masked_pref_error(
- xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
- y_stride, w, h, &sse);
- } else {
- const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
-
- thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
- src_stride, mask, mask_stride, &sse);
- }
-
- cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
- error_per_bit);
-
- if (cost_array[4] < besterr) {
- best_idx = 4;
- besterr = cost_array[4];
- *distortion = thismse;
- *sse1 = sse;
- }
- } else {
- cost_array[idx] = INT_MAX;
- }
-
- if (best_idx < 4 && best_idx >= 0) {
- br += search_step[best_idx].row;
- bc += search_step[best_idx].col;
- } else if (best_idx == 4) {
- br = tr;
- bc = tc;
- }
-
- if (iters_per_step > 1 && best_idx != -1) {
- if (use_upsampled_ref) {
- SECOND_LEVEL_CHECKS_BEST(1);
- } else {
- SECOND_LEVEL_CHECKS_BEST(0);
- }
- }
-
- tr = br;
- tc = bc;
-
- search_step += 4;
- hstep >>= 1;
- best_idx = -1;
- }
-
- // These lines insure static analysis doesn't warn that
- // tr and tc aren't used after the above point.
- (void)tr;
- (void)tc;
-
- bestmv->row = br;
- bestmv->col = bc;
-
- if (use_upsampled_ref) {
- pd->pre[is_second] = backup_pred;
- }
-
- return besterr;
-}
-
-#undef DIST
-#undef MVC
-#undef CHECK_BETTER
-
-static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask,
- int mask_stride, const MV *best_mv,
- const MV *center_mv,
- const aom_variance_fn_ptr_t *vfp,
- int use_mvcost, int is_second) {
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
- const MV mv = { best_mv->row * 8, best_mv->col * 8 };
- unsigned int unused;
-
- return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
- in_what->stride, mask, mask_stride, &unused) +
- (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
- x->errorperbit)
- : 0);
-}
-
-int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask,
- int mask_stride, MV *ref_mv, int error_per_bit,
- int search_range,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, int is_second) {
- const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
- const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- unsigned int best_sad =
- fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
- in_what->stride, mask, mask_stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
- int i, j;
-
- for (i = 0; i < search_range; i++) {
- int best_site = -1;
-
- for (j = 0; j < 4; j++) {
- const MV mv = { ref_mv->row + neighbors[j].row,
- ref_mv->col + neighbors[j].col };
- if (is_mv_in(&x->mv_limits, &mv)) {
- unsigned int sad =
- fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
- in_what->stride, mask, mask_stride);
- if (sad < best_sad) {
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- best_site = j;
- }
- }
- }
- }
-
- if (best_site == -1) {
- break;
- } else {
- ref_mv->row += neighbors[best_site].row;
- ref_mv->col += neighbors[best_site].col;
- }
- }
- return best_sad;
-}
-
-int masked_diamond_search_sad(const MACROBLOCK *x,
- const search_site_config *cfg,
- const uint8_t *mask, int mask_stride, MV *ref_mv,
- MV *best_mv, int search_param, int sad_per_bit,
- int *num00, const aom_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, int is_second) {
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
- // search_param determines the length of the initial step and hence the number
- // of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
- // (MAX_FIRST_STEP/4) pel... etc.
- const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
- const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
- const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- const uint8_t *best_address, *in_what_ref;
- int best_sad = INT_MAX;
- int best_site = 0;
- int last_site = 0;
- int i, j, step;
-
- clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
- x->mv_limits.row_min, x->mv_limits.row_max);
- in_what_ref = get_buf_from_mv(in_what, ref_mv);
- best_address = in_what_ref;
- *num00 = 0;
- *best_mv = *ref_mv;
-
- // Check the starting position
- best_sad = fn_ptr->msdf(what->buf, what->stride, best_address,
- in_what->stride, mask, mask_stride) +
- mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
- i = 1;
-
- for (step = 0; step < tot_steps; step++) {
- for (j = 0; j < cfg->searches_per_step; j++) {
- const MV mv = { best_mv->row + ss[i].mv.row,
- best_mv->col + ss[i].mv.col };
- if (is_mv_in(&x->mv_limits, &mv)) {
- int sad =
- fn_ptr->msdf(what->buf, what->stride, best_address + ss[i].offset,
- in_what->stride, mask, mask_stride);
- if (sad < best_sad) {
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- best_site = i;
- }
- }
- }
-
- i++;
- }
-
- if (best_site != last_site) {
- best_mv->row += ss[best_site].mv.row;
- best_mv->col += ss[best_site].mv.col;
- best_address += ss[best_site].offset;
- last_site = best_site;
-#if defined(NEW_DIAMOND_SEARCH)
- while (1) {
- const MV this_mv = { best_mv->row + ss[best_site].mv.row,
- best_mv->col + ss[best_site].mv.col };
- if (is_mv_in(&x->mv_limits, &this_mv)) {
- int sad = fn_ptr->msdf(what->buf, what->stride,
- best_address + ss[best_site].offset,
- in_what->stride, mask, mask_stride);
- if (sad < best_sad) {
- sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- best_mv->row += ss[best_site].mv.row;
- best_mv->col += ss[best_site].mv.col;
- best_address += ss[best_site].offset;
- continue;
- }
- }
- }
- break;
- }
-#endif
- } else if (best_address == in_what_ref) {
- (*num00)++;
- }
- }
- return best_sad;
-}
-
-int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
- const uint8_t *mask, int mask_stride,
- MV *mvp_full, int step_param, int sadpb,
- int further_steps, int do_refine,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second) {
- MV temp_mv;
- int thissme, n, num00 = 0;
- int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg, mask, mask_stride,
- mvp_full, &temp_mv, step_param, sadpb,
- &n, fn_ptr, ref_mv, is_second);
- if (bestsme < INT_MAX)
- bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
- fn_ptr, 1, is_second);
- *dst_mv = temp_mv;
-
- // If there won't be more n-step search, check to see if refining search is
- // needed.
- if (n > further_steps) do_refine = 0;
-
- while (n < further_steps) {
- ++n;
-
- if (num00) {
- num00--;
- } else {
- thissme = masked_diamond_search_sad(
- x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &temp_mv,
- step_param + n, sadpb, &num00, fn_ptr, ref_mv, is_second);
- if (thissme < INT_MAX)
- thissme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
- fn_ptr, 1, is_second);
-
- // check to see if refining search is needed.
- if (num00 > further_steps - n) do_refine = 0;
-
- if (thissme < bestsme) {
- bestsme = thissme;
- *dst_mv = temp_mv;
- }
- }
- }
-
- // final 1-away diamond refining search
- if (do_refine) {
- const int search_range = 8;
- MV best_mv = *dst_mv;
- thissme =
- masked_refining_search_sad(x, mask, mask_stride, &best_mv, sadpb,
- search_range, fn_ptr, ref_mv, is_second);
- if (thissme < INT_MAX)
- thissme = get_masked_mvpred_var(x, mask, mask_stride, &best_mv, ref_mv,
- fn_ptr, 1, is_second);
- if (thissme < bestsme) {
- bestsme = thissme;
- *dst_mv = best_mv;
- }
- }
- return bestsme;
-}
-#endif // CONFIG_EXT_INTER
-
#if CONFIG_MOTION_VAR
/* returns subpixel variance error function */
#define DIST(r, c) \
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index eb989e8..9ed0817 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -136,27 +136,6 @@
int error_per_bit, int *cost_list, const MV *ref_mv,
int var_max, int rd);
-#if CONFIG_EXT_INTER
-int av1_find_best_masked_sub_pixel_tree(
- const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
- const MV *ref_mv, int allow_hp, int error_per_bit,
- const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
- int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
- int is_second);
-int av1_find_best_masked_sub_pixel_tree_up(
- const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask,
- int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv,
- int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
- int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2],
- int *distortion, unsigned int *sse1, int is_second, int use_upsampled_ref);
-int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
- const uint8_t *mask, int mask_stride,
- MV *mvp_full, int step_param, int sadpb,
- int further_steps, int do_refine,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second);
-#endif // CONFIG_EXT_INTER
-
#if CONFIG_MOTION_VAR
int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param, int sadpb,
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index 53f85ee..c0b6eb2 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -27,9 +27,11 @@
namespace {
const int number_of_iterations = 500;
-typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride);
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
@@ -52,6 +54,7 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
int err_count = 0;
int first_failure = -1;
@@ -62,18 +65,23 @@
for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
src_ptr[j] = rnd.Rand8();
ref_ptr[j] = rnd.Rand8();
+ second_pred_ptr[j] = rnd.Rand8();
msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
assert(msk_ptr[j] <= 64);
}
- ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
- msk_ptr, msk_stride);
- ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr,
- ref_stride, msk_ptr,
- msk_stride));
- if (ret != ref_ret) {
- err_count++;
- if (first_failure == -1) first_failure = i;
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret =
+ ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred_ptr, msk_ptr, msk_stride, invert_mask);
+ ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+ ref_stride, second_pred_ptr,
+ msk_ptr, msk_stride,
+ invert_mask));
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
}
}
EXPECT_EQ(0, err_count)
@@ -82,9 +90,11 @@
}
#if CONFIG_HIGHBITDEPTH
-typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride);
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
HighbdMaskedSADParam;
@@ -109,9 +119,11 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
int err_count = 0;
int first_failure = -1;
int src_stride = MAX_SB_SIZE;
@@ -121,17 +133,22 @@
for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
src_ptr[j] = rnd.Rand16() & 0xfff;
ref_ptr[j] = rnd.Rand16() & 0xfff;
+ second_pred_ptr[j] = rnd.Rand16() & 0xfff;
msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
}
- ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
- msk_ptr, msk_stride);
- ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride, ref8_ptr,
- ref_stride, msk_ptr,
- msk_stride));
- if (ret != ref_ret) {
- err_count++;
- if (first_failure == -1) first_failure = i;
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret =
+ ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
+ ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+ ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr,
+ msk_stride, invert_mask));
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
}
}
EXPECT_EQ(0, err_count)
@@ -142,65 +159,83 @@
using std::tr1::make_tuple;
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
- SSSE3_C_COMPARE, MaskedSADTest,
- ::testing::Values(
-#if CONFIG_EXT_PARTITION
- make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
- make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
- make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
-#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
- make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
- make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
- make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
- make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
- make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
- make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
- make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
- make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
- make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
- make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
- make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
- make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c)));
-#if CONFIG_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
+// TODO(david.barker): Re-enable this once we have vectorized
+// versions of the masked_compound_* functions
+#if 0 && HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSADTest,
::testing::Values(
#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_masked_sad128x128_ssse3,
- &aom_highbd_masked_sad128x128_c),
- make_tuple(&aom_highbd_masked_sad128x64_ssse3,
- &aom_highbd_masked_sad128x64_c),
- make_tuple(&aom_highbd_masked_sad64x128_ssse3,
- &aom_highbd_masked_sad64x128_c),
+ make_tuple(&aom_masked_compound_sad128x128_ssse3,
+ &aom_masked_compound_sad128x128_c),
+ make_tuple(&aom_masked_compound_sad128x64_ssse3,
+ &aom_masked_compound_sad128x64_c),
+ make_tuple(&aom_masked_compound_sad64x128_ssse3,
+ &aom_masked_compound_sad64x128_c),
#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_masked_sad64x64_ssse3,
- &aom_highbd_masked_sad64x64_c),
- make_tuple(&aom_highbd_masked_sad64x32_ssse3,
- &aom_highbd_masked_sad64x32_c),
- make_tuple(&aom_highbd_masked_sad32x64_ssse3,
- &aom_highbd_masked_sad32x64_c),
- make_tuple(&aom_highbd_masked_sad32x32_ssse3,
- &aom_highbd_masked_sad32x32_c),
- make_tuple(&aom_highbd_masked_sad32x16_ssse3,
- &aom_highbd_masked_sad32x16_c),
- make_tuple(&aom_highbd_masked_sad16x32_ssse3,
- &aom_highbd_masked_sad16x32_c),
- make_tuple(&aom_highbd_masked_sad16x16_ssse3,
- &aom_highbd_masked_sad16x16_c),
- make_tuple(&aom_highbd_masked_sad16x8_ssse3,
- &aom_highbd_masked_sad16x8_c),
- make_tuple(&aom_highbd_masked_sad8x16_ssse3,
- &aom_highbd_masked_sad8x16_c),
- make_tuple(&aom_highbd_masked_sad8x8_ssse3,
- &aom_highbd_masked_sad8x8_c),
- make_tuple(&aom_highbd_masked_sad8x4_ssse3,
- &aom_highbd_masked_sad8x4_c),
- make_tuple(&aom_highbd_masked_sad4x8_ssse3,
- &aom_highbd_masked_sad4x8_c),
- make_tuple(&aom_highbd_masked_sad4x4_ssse3,
- &aom_highbd_masked_sad4x4_c)));
+ make_tuple(&aom_masked_compound_sad64x64_ssse3,
+ &aom_masked_compound_sad64x64_c),
+ make_tuple(&aom_masked_compound_sad64x32_ssse3,
+ &aom_masked_compound_sad64x32_c),
+ make_tuple(&aom_masked_compound_sad32x64_ssse3,
+ &aom_masked_compound_sad32x64_c),
+ make_tuple(&aom_masked_compound_sad32x32_ssse3,
+ &aom_masked_compound_sad32x32_c),
+ make_tuple(&aom_masked_compound_sad32x16_ssse3,
+ &aom_masked_compound_sad32x16_c),
+ make_tuple(&aom_masked_compound_sad16x32_ssse3,
+ &aom_masked_compound_sad16x32_c),
+ make_tuple(&aom_masked_compound_sad16x16_ssse3,
+ &aom_masked_compound_sad16x16_c),
+ make_tuple(&aom_masked_compound_sad16x8_ssse3,
+ &aom_masked_compound_sad16x8_c),
+ make_tuple(&aom_masked_compound_sad8x16_ssse3,
+ &aom_masked_compound_sad8x16_c),
+ make_tuple(&aom_masked_compound_sad8x8_ssse3,
+ &aom_masked_compound_sad8x8_c),
+ make_tuple(&aom_masked_compound_sad8x4_ssse3,
+ &aom_masked_compound_sad8x4_c),
+ make_tuple(&aom_masked_compound_sad4x8_ssse3,
+ &aom_masked_compound_sad4x8_c),
+ make_tuple(&aom_masked_compound_sad4x4_ssse3,
+ &aom_masked_compound_sad4x4_c)));
+#if CONFIG_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_C_COMPARE, HighbdMaskedSADTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_compound_sad128x128_ssse3,
+ &aom_highbd_masked_compound_sad128x128_c),
+ make_tuple(&aom_highbd_masked_compound_sad128x64_ssse3,
+ &aom_highbd_masked_compound_sad128x64_c),
+ make_tuple(&aom_highbd_masked_compound_sad64x128_ssse3,
+ &aom_highbd_masked_compound_sad64x128_c),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_compound_sad64x64_ssse3,
+ &aom_highbd_masked_compound_sad64x64_c),
+ make_tuple(&aom_highbd_masked_compound_sad64x32_ssse3,
+ &aom_highbd_masked_compound_sad64x32_c),
+ make_tuple(&aom_highbd_masked_compound_sad32x64_ssse3,
+ &aom_highbd_masked_compound_sad32x64_c),
+ make_tuple(&aom_highbd_masked_compound_sad32x32_ssse3,
+ &aom_highbd_masked_compound_sad32x32_c),
+ make_tuple(&aom_highbd_masked_compound_sad32x16_ssse3,
+ &aom_highbd_masked_compound_sad32x16_c),
+ make_tuple(&aom_highbd_masked_compound_sad16x32_ssse3,
+ &aom_highbd_masked_compound_sad16x32_c),
+ make_tuple(&aom_highbd_masked_compound_sad16x16_ssse3,
+ &aom_highbd_masked_compound_sad16x16_c),
+ make_tuple(&aom_highbd_masked_compound_sad16x8_ssse3,
+ &aom_highbd_masked_compound_sad16x8_c),
+ make_tuple(&aom_highbd_masked_compound_sad8x16_ssse3,
+ &aom_highbd_masked_compound_sad8x16_c),
+ make_tuple(&aom_highbd_masked_compound_sad8x8_ssse3,
+ &aom_highbd_masked_compound_sad8x8_c),
+ make_tuple(&aom_highbd_masked_compound_sad8x4_ssse3,
+ &aom_highbd_masked_compound_sad8x4_c),
+ make_tuple(&aom_highbd_masked_compound_sad4x8_ssse3,
+ &aom_highbd_masked_compound_sad4x8_c),
+ make_tuple(&aom_highbd_masked_compound_sad4x4_ssse3,
+ &aom_highbd_masked_compound_sad4x4_c)));
#endif // CONFIG_HIGHBITDEPTH
-#endif // HAVE_SSSE3
+#endif // 0 && HAVE_SSSE3
} // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index 65e852a..e0fc010 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -31,105 +31,10 @@
namespace {
const int number_of_iterations = 500;
-typedef unsigned int (*MaskedVarianceFunc)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride,
- unsigned int *sse);
-
-typedef std::tr1::tuple<MaskedVarianceFunc, MaskedVarianceFunc>
- MaskedVarianceParam;
-
-class MaskedVarianceTest
- : public ::testing::TestWithParam<MaskedVarianceParam> {
- public:
- virtual ~MaskedVarianceTest() {}
- virtual void SetUp() {
- opt_func_ = GET_PARAM(0);
- ref_func_ = GET_PARAM(1);
- }
-
- virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
- MaskedVarianceFunc opt_func_;
- MaskedVarianceFunc ref_func_;
-};
-
-TEST_P(MaskedVarianceTest, OperationCheck) {
- unsigned int ref_ret, opt_ret;
- unsigned int ref_sse, opt_sse;
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- int err_count = 0;
- int first_failure = -1;
- int src_stride = MAX_SB_SIZE;
- int ref_stride = MAX_SB_SIZE;
- int msk_stride = MAX_SB_SIZE;
-
- for (int i = 0; i < number_of_iterations; ++i) {
- for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
- src_ptr[j] = rnd.Rand8();
- ref_ptr[j] = rnd.Rand8();
- msk_ptr[j] = rnd(65);
- }
-
- ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, msk_ptr,
- msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride, ref_ptr,
- ref_stride, msk_ptr,
- msk_stride, &opt_sse));
-
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) first_failure = i;
- }
- }
-
- EXPECT_EQ(0, err_count) << "Error: Masked Variance Test OperationCheck,"
- << "C output doesn't match SSSE3 output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(MaskedVarianceTest, ExtremeValues) {
- unsigned int ref_ret, opt_ret;
- unsigned int ref_sse, opt_sse;
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- int err_count = 0;
- int first_failure = -1;
- int src_stride = MAX_SB_SIZE;
- int ref_stride = MAX_SB_SIZE;
- int msk_stride = MAX_SB_SIZE;
-
- for (int i = 0; i < 8; ++i) {
- memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
- memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
- memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
-
- ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, msk_ptr,
- msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride, ref_ptr,
- ref_stride, msk_ptr,
- msk_stride, &opt_sse));
-
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) first_failure = i;
- }
- }
-
- EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
- << "C output doesn't match SSSE3 output. "
- << "First failed at test case " << first_failure;
-}
-
typedef unsigned int (*MaskedSubPixelVarianceFunc)(
- const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
- int b_stride, const uint8_t *m, int m_stride, unsigned int *sse);
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
MaskedSubPixelVarianceParam;
@@ -156,6 +61,8 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
int err_count = 0;
int first_failure = -1;
@@ -171,6 +78,7 @@
for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
src_ptr[j] = rnd.Rand8();
ref_ptr[j] = rnd.Rand8();
+ second_pred_ptr[j] = rnd.Rand8();
msk_ptr[j] = rnd(65);
}
for (int k = 0; k < 3; k++) {
@@ -178,16 +86,23 @@
for (int l = 0; l < 3; l++) {
xoffset = xoffsets[k];
yoffset = yoffsets[l];
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          // Arguments: src, src_stride, xoffset, yoffset, ref, ref_stride,
+          // second_pred, msk, msk_stride, invert_mask, sse.
+ ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+ ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask, &opt_sse));
- ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
- ref_stride, msk_ptr, msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(
- opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
- ref_stride, msk_ptr, msk_stride, &opt_sse));
-
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) first_failure = i;
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
}
}
}
@@ -205,6 +120,8 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
int first_failure_x = -1;
int first_failure_y = -1;
@@ -216,26 +133,32 @@
for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
- for (int i = 0; i < 8; ++i) {
+ for (int i = 0; i < 16; ++i) {
memset(src_ptr, (i & 0x1) ? 255 : 0,
(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
memset(ref_ptr, (i & 0x2) ? 255 : 0,
(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
- memset(msk_ptr, (i & 0x4) ? 64 : 0,
+ memset(second_pred_ptr, (i & 0x4) ? 255 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+ memset(msk_ptr, (i & 0x8) ? 64 : 0,
(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
- ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
- ref_stride, msk_ptr, msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(
- opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
- ref_stride, msk_ptr, msk_stride, &opt_sse));
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+ ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask, &opt_sse));
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) {
- first_failure = i;
- first_failure_x = xoffset;
- first_failure_y = yoffset;
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
}
}
}
@@ -250,105 +173,6 @@
}
#if CONFIG_HIGHBITDEPTH
-typedef std::tr1::tuple<MaskedVarianceFunc, MaskedVarianceFunc, aom_bit_depth_t>
- HighbdMaskedVarianceParam;
-
-class HighbdMaskedVarianceTest
- : public ::testing::TestWithParam<HighbdMaskedVarianceParam> {
- public:
- virtual ~HighbdMaskedVarianceTest() {}
- virtual void SetUp() {
- opt_func_ = GET_PARAM(0);
- ref_func_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- }
-
- virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
- MaskedVarianceFunc opt_func_;
- MaskedVarianceFunc ref_func_;
- aom_bit_depth_t bit_depth_;
-};
-
-TEST_P(HighbdMaskedVarianceTest, OperationCheck) {
- unsigned int ref_ret, opt_ret;
- unsigned int ref_sse, opt_sse;
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
- uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
- int err_count = 0;
- int first_failure = -1;
- int src_stride = MAX_SB_SIZE;
- int ref_stride = MAX_SB_SIZE;
- int msk_stride = MAX_SB_SIZE;
-
- for (int i = 0; i < number_of_iterations; ++i) {
- for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
- src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
- ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
- msk_ptr[j] = rnd(65);
- }
-
- ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, msk_ptr,
- msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride, ref8_ptr,
- ref_stride, msk_ptr,
- msk_stride, &opt_sse));
-
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) first_failure = i;
- }
- }
-
- EXPECT_EQ(0, err_count) << "Error: Masked Variance Test OperationCheck,"
- << "C output doesn't match SSSE3 output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(HighbdMaskedVarianceTest, ExtremeValues) {
- unsigned int ref_ret, opt_ret;
- unsigned int ref_sse, opt_sse;
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
- uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
- uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
- int err_count = 0;
- int first_failure = -1;
- int src_stride = MAX_SB_SIZE;
- int ref_stride = MAX_SB_SIZE;
- int msk_stride = MAX_SB_SIZE;
-
- for (int i = 0; i < 8; ++i) {
- aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
- MAX_SB_SIZE * MAX_SB_SIZE);
- aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
- MAX_SB_SIZE * MAX_SB_SIZE);
- memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
-
- ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, msk_ptr,
- msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride, ref8_ptr,
- ref_stride, msk_ptr,
- msk_stride, &opt_sse));
-
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) first_failure = i;
- }
- }
-
- EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
- << "C output doesn't match SSSE3 output. "
- << "First failed at test case " << first_failure;
-}
-
typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
aom_bit_depth_t>
HighbdMaskedSubPixelVarianceParam;
@@ -377,9 +201,12 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint16_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
int err_count = 0;
int first_failure = -1;
int first_failure_x = -1;
@@ -395,22 +222,26 @@
for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
- ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
- ref_stride, msk_ptr, msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(opt_ret =
- opt_func_(src8_ptr, src_stride, xoffset,
- yoffset, ref8_ptr, ref_stride,
- msk_ptr, msk_stride, &opt_sse));
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+ ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+ ref8_ptr, ref_stride, second_pred8_ptr,
+ msk_ptr, msk_stride, invert_mask, &opt_sse));
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) {
- first_failure = i;
- first_failure_x = xoffset;
- first_failure_y = yoffset;
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
}
}
}
@@ -431,8 +262,11 @@
DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint16_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
int first_failure_x = -1;
int first_failure_y = -1;
int err_count = 0;
@@ -443,27 +277,32 @@
for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
- for (int i = 0; i < 8; ++i) {
+ for (int i = 0; i < 16; ++i) {
aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
- memset(msk_ptr, (i & 0x4) ? 64 : 0,
+ aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+ memset(msk_ptr, (i & 0x8) ? 64 : 0,
(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
- ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
- ref_stride, msk_ptr, msk_stride, &ref_sse);
- ASM_REGISTER_STATE_CHECK(opt_ret =
- opt_func_(src8_ptr, src_stride, xoffset,
- yoffset, ref8_ptr, ref_stride,
- msk_ptr, msk_stride, &opt_sse));
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+ ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+ ref8_ptr, ref_stride, second_pred8_ptr,
+ msk_ptr, msk_stride, invert_mask, &opt_sse));
- if (opt_ret != ref_ret || opt_sse != ref_sse) {
- err_count++;
- if (first_failure == -1) {
- first_failure = i;
- first_failure_x = xoffset;
- first_failure_y = yoffset;
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
}
}
}
@@ -480,311 +319,208 @@
using std::tr1::make_tuple;
-#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
- SSSE3_C_COMPARE, MaskedVarianceTest,
- ::testing::Values(
-#if CONFIG_EXT_PARTITION
- make_tuple(&aom_masked_variance128x128_ssse3,
- &aom_masked_variance128x128_c),
- make_tuple(&aom_masked_variance128x64_ssse3,
- &aom_masked_variance128x64_c),
- make_tuple(&aom_masked_variance64x128_ssse3,
- &aom_masked_variance64x128_c),
-#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_masked_variance64x64_ssse3,
- &aom_masked_variance64x64_c),
- make_tuple(&aom_masked_variance64x32_ssse3,
- &aom_masked_variance64x32_c),
- make_tuple(&aom_masked_variance32x64_ssse3,
- &aom_masked_variance32x64_c),
- make_tuple(&aom_masked_variance32x32_ssse3,
- &aom_masked_variance32x32_c),
- make_tuple(&aom_masked_variance32x16_ssse3,
- &aom_masked_variance32x16_c),
- make_tuple(&aom_masked_variance16x32_ssse3,
- &aom_masked_variance16x32_c),
- make_tuple(&aom_masked_variance16x16_ssse3,
- &aom_masked_variance16x16_c),
- make_tuple(&aom_masked_variance16x8_ssse3, &aom_masked_variance16x8_c),
- make_tuple(&aom_masked_variance8x16_ssse3, &aom_masked_variance8x16_c),
- make_tuple(&aom_masked_variance8x8_ssse3, &aom_masked_variance8x8_c),
- make_tuple(&aom_masked_variance8x4_ssse3, &aom_masked_variance8x4_c),
- make_tuple(&aom_masked_variance4x8_ssse3, &aom_masked_variance4x8_c),
- make_tuple(&aom_masked_variance4x4_ssse3, &aom_masked_variance4x4_c)));
-
+// TODO(david.barker): Re-enable this once we have vectorized
+// versions of the masked_compound_* functions
+#if 0 && HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
::testing::Values(
#if CONFIG_EXT_PARTITION
- make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
- &aom_masked_sub_pixel_variance128x128_c),
- make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
- &aom_masked_sub_pixel_variance128x64_c),
- make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
- &aom_masked_sub_pixel_variance64x128_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance128x128_ssse3,
+ &aom_masked_compound_sub_pixel_variance128x128_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance128x64_ssse3,
+ &aom_masked_compound_sub_pixel_variance128x64_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance64x128_ssse3,
+ &aom_masked_compound_sub_pixel_variance64x128_c),
#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
- &aom_masked_sub_pixel_variance64x64_c),
- make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
- &aom_masked_sub_pixel_variance64x32_c),
- make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3,
- &aom_masked_sub_pixel_variance32x64_c),
- make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3,
- &aom_masked_sub_pixel_variance32x32_c),
- make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3,
- &aom_masked_sub_pixel_variance32x16_c),
- make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3,
- &aom_masked_sub_pixel_variance16x32_c),
- make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3,
- &aom_masked_sub_pixel_variance16x16_c),
- make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3,
- &aom_masked_sub_pixel_variance16x8_c),
- make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3,
- &aom_masked_sub_pixel_variance8x16_c),
- make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3,
- &aom_masked_sub_pixel_variance8x8_c),
- make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3,
- &aom_masked_sub_pixel_variance8x4_c),
- make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
- &aom_masked_sub_pixel_variance4x8_c),
- make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
- &aom_masked_sub_pixel_variance4x4_c)));
+ make_tuple(&aom_masked_compound_sub_pixel_variance64x64_ssse3,
+ &aom_masked_compound_sub_pixel_variance64x64_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance64x32_ssse3,
+ &aom_masked_compound_sub_pixel_variance64x32_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance32x64_ssse3,
+ &aom_masked_compound_sub_pixel_variance32x64_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance32x32_ssse3,
+ &aom_masked_compound_sub_pixel_variance32x32_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance32x16_ssse3,
+ &aom_masked_compound_sub_pixel_variance32x16_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance16x32_ssse3,
+ &aom_masked_compound_sub_pixel_variance16x32_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance16x16_ssse3,
+ &aom_masked_compound_sub_pixel_variance16x16_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance16x8_ssse3,
+ &aom_masked_compound_sub_pixel_variance16x8_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance8x16_ssse3,
+ &aom_masked_compound_sub_pixel_variance8x16_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance8x8_ssse3,
+ &aom_masked_compound_sub_pixel_variance8x8_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance8x4_ssse3,
+ &aom_masked_compound_sub_pixel_variance8x4_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance4x8_ssse3,
+ &aom_masked_compound_sub_pixel_variance4x8_c),
+ make_tuple(&aom_masked_compound_sub_pixel_variance4x4_ssse3,
+ &aom_masked_compound_sub_pixel_variance4x4_c)));
#if CONFIG_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
- SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
- ::testing::Values(
-#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_masked_variance128x128_ssse3,
- &aom_highbd_masked_variance128x128_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance128x64_ssse3,
- &aom_highbd_masked_variance128x64_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance64x128_ssse3,
- &aom_highbd_masked_variance64x128_c, AOM_BITS_8),
-#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_masked_variance64x64_ssse3,
- &aom_highbd_masked_variance64x64_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance64x32_ssse3,
- &aom_highbd_masked_variance64x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance32x64_ssse3,
- &aom_highbd_masked_variance32x64_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance32x32_ssse3,
- &aom_highbd_masked_variance32x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance32x16_ssse3,
- &aom_highbd_masked_variance32x16_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance16x32_ssse3,
- &aom_highbd_masked_variance16x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance16x16_ssse3,
- &aom_highbd_masked_variance16x16_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance16x8_ssse3,
- &aom_highbd_masked_variance16x8_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance8x16_ssse3,
- &aom_highbd_masked_variance8x16_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance8x8_ssse3,
- &aom_highbd_masked_variance8x8_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance8x4_ssse3,
- &aom_highbd_masked_variance8x4_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance4x8_ssse3,
- &aom_highbd_masked_variance4x8_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_variance4x4_ssse3,
- &aom_highbd_masked_variance4x4_c, AOM_BITS_8),
-#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_10_masked_variance128x128_ssse3,
- &aom_highbd_10_masked_variance128x128_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance128x64_ssse3,
- &aom_highbd_10_masked_variance128x64_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance64x128_ssse3,
- &aom_highbd_10_masked_variance64x128_c, AOM_BITS_10),
-#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_10_masked_variance64x64_ssse3,
- &aom_highbd_10_masked_variance64x64_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance64x32_ssse3,
- &aom_highbd_10_masked_variance64x32_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance32x64_ssse3,
- &aom_highbd_10_masked_variance32x64_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance32x32_ssse3,
- &aom_highbd_10_masked_variance32x32_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance32x16_ssse3,
- &aom_highbd_10_masked_variance32x16_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance16x32_ssse3,
- &aom_highbd_10_masked_variance16x32_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance16x16_ssse3,
- &aom_highbd_10_masked_variance16x16_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance16x8_ssse3,
- &aom_highbd_10_masked_variance16x8_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance8x16_ssse3,
- &aom_highbd_10_masked_variance8x16_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance8x8_ssse3,
- &aom_highbd_10_masked_variance8x8_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance8x4_ssse3,
- &aom_highbd_10_masked_variance8x4_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance4x8_ssse3,
- &aom_highbd_10_masked_variance4x8_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_variance4x4_ssse3,
- &aom_highbd_10_masked_variance4x4_c, AOM_BITS_10),
-#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_12_masked_variance128x128_ssse3,
- &aom_highbd_12_masked_variance128x128_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance128x64_ssse3,
- &aom_highbd_12_masked_variance128x64_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance64x128_ssse3,
- &aom_highbd_12_masked_variance64x128_c, AOM_BITS_12),
-#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_12_masked_variance64x64_ssse3,
- &aom_highbd_12_masked_variance64x64_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance64x32_ssse3,
- &aom_highbd_12_masked_variance64x32_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance32x64_ssse3,
- &aom_highbd_12_masked_variance32x64_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance32x32_ssse3,
- &aom_highbd_12_masked_variance32x32_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance32x16_ssse3,
- &aom_highbd_12_masked_variance32x16_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance16x32_ssse3,
- &aom_highbd_12_masked_variance16x32_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance16x16_ssse3,
- &aom_highbd_12_masked_variance16x16_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance16x8_ssse3,
- &aom_highbd_12_masked_variance16x8_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance8x16_ssse3,
- &aom_highbd_12_masked_variance8x16_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance8x8_ssse3,
- &aom_highbd_12_masked_variance8x8_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance8x4_ssse3,
- &aom_highbd_12_masked_variance8x4_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance4x8_ssse3,
- &aom_highbd_12_masked_variance4x8_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_variance4x4_ssse3,
- &aom_highbd_12_masked_variance4x4_c, AOM_BITS_12)));
-
-INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
::testing::Values(
#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_masked_sub_pixel_variance128x128_ssse3,
- &aom_highbd_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance128x64_ssse3,
- &aom_highbd_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance64x128_ssse3,
- &aom_highbd_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance128x128_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance128x64_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance64x128_c,
+ AOM_BITS_8),
#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_masked_sub_pixel_variance64x64_ssse3,
- &aom_highbd_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance64x32_ssse3,
- &aom_highbd_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance32x64_ssse3,
- &aom_highbd_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance32x32_ssse3,
- &aom_highbd_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance32x16_ssse3,
- &aom_highbd_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance16x32_ssse3,
- &aom_highbd_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance16x16_ssse3,
- &aom_highbd_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance16x8_ssse3,
- &aom_highbd_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance8x16_ssse3,
- &aom_highbd_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance8x8_ssse3,
- &aom_highbd_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance8x4_ssse3,
- &aom_highbd_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance4x8_ssse3,
- &aom_highbd_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
- make_tuple(&aom_highbd_masked_sub_pixel_variance4x4_ssse3,
- &aom_highbd_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance64x64_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance64x32_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance32x64_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance32x32_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance32x16_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance16x32_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance16x16_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance16x8_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance8x16_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance8x8_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance8x4_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance4x8_c,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_compound_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_masked_compound_sub_pixel_variance4x4_c,
+ AOM_BITS_8),
#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance128x128_c,
- AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance128x64_c,
- AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance64x128_c,
- AOM_BITS_10),
+ make_tuple(
+ &aom_highbd_10_masked_compound_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance128x128_c,
+ AOM_BITS_10),
+ make_tuple(
+ &aom_highbd_10_masked_compound_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance128x64_c,
+ AOM_BITS_10),
+ make_tuple(
+ &aom_highbd_10_masked_compound_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance64x128_c,
+ AOM_BITS_10),
#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance64x64_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance64x64_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance64x32_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance64x32_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance32x64_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance32x64_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance32x32_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance32x32_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance32x16_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance32x16_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance16x32_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance16x32_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance16x16_c,
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance16x16_c,
AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
- make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3,
- &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance16x8_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance8x16_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance8x8_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance8x4_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance4x8_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_compound_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_10_masked_compound_sub_pixel_variance4x4_c,
+ AOM_BITS_10),
#if CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance128x128_c,
- AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance128x64_c,
- AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance64x128_c,
- AOM_BITS_12),
+ make_tuple(
+ &aom_highbd_12_masked_compound_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance128x128_c,
+ AOM_BITS_12),
+ make_tuple(
+ &aom_highbd_12_masked_compound_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance128x64_c,
+ AOM_BITS_12),
+ make_tuple(
+ &aom_highbd_12_masked_compound_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance64x128_c,
+ AOM_BITS_12),
#endif // CONFIG_EXT_PARTITION
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance64x64_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance64x64_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance64x32_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance64x32_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance32x64_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance32x64_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance32x32_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance32x32_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance32x16_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance32x16_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance16x32_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance16x32_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance16x16_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance16x16_c,
AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
- make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
- &aom_highbd_12_masked_sub_pixel_variance4x4_c,
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance16x8_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance8x16_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance8x8_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance8x4_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance4x8_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_compound_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_12_masked_compound_sub_pixel_variance4x4_c,
AOM_BITS_12)));
#endif // CONFIG_HIGHBITDEPTH
-#endif // HAVE_SSSE3
+#endif // 0 && HAVE_SSSE3
} // namespace
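
For reference, a minimal sketch (not part of the patch) of calling one of the new
masked compound sub-pixel variance C functions directly, following the
MaskedSubPixelVarianceFunc argument order used by the reworked tests above. The
prototype normally comes from the generated aom_dsp_rtcd.h; it is restated here
only so the sketch is self-contained, and the buffer sizes are arbitrary small
examples rather than values taken from the tests.

// Hedged usage sketch for the new masked compound signature.
#include <stdint.h>

extern "C" unsigned int aom_masked_compound_sub_pixel_variance8x8_c(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);

int main(void) {
  // Oversize src/ref slightly so nonzero sub-pel offsets would stay in bounds.
  uint8_t src[9 * 9] = { 0 };
  uint8_t ref[9 * 9] = { 0 };
  uint8_t second_pred[8 * 8] = { 0 };
  uint8_t msk[8 * 8] = { 0 };  // mask values in [0, 64], as in the tests
  unsigned int sse = 0;
  // invert_mask swaps which of the two predictions the mask weights favor.
  unsigned int var = aom_masked_compound_sub_pixel_variance8x8_c(
      src, /*src_stride=*/9, /*xoffset=*/0, /*yoffset=*/0, ref,
      /*ref_stride=*/9, second_pred, msk, /*msk_stride=*/8,
      /*invert_mask=*/0, &sse);
  (void)var;
  return 0;
}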