AVX2: Add optimization for sad_mxnx3d
BUG=aomedia:3358
Change-Id: I9ceead2183c2d9673548af57634070a3738d14ab
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 3bc307b..427a3dc 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -999,6 +999,7 @@
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]";
add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
@@ -1052,6 +1053,22 @@
specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
+ specialize qw/aom_sad128x128x3d avx2/;
+ specialize qw/aom_sad128x64x3d avx2/;
+ specialize qw/aom_sad64x128x3d avx2/;
+ specialize qw/aom_sad64x64x3d avx2/;
+ specialize qw/aom_sad64x32x3d avx2/;
+ specialize qw/aom_sad32x64x3d avx2/;
+ specialize qw/aom_sad32x32x3d avx2/;
+ specialize qw/aom_sad32x16x3d avx2/;
+ specialize qw/aom_sad16x32x3d avx2/;
+ specialize qw/aom_sad16x16x3d avx2/;
+ specialize qw/aom_sad16x8x3d avx2/;
+
+ specialize qw/aom_sad64x16x3d avx2/;
+ specialize qw/aom_sad32x8x3d avx2/;
+ specialize qw/aom_sad16x64x3d avx2/;
+
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
specialize qw/aom_sad128x128x4d_avg sse2/;
specialize qw/aom_sad128x64x4d_avg sse2/;
@@ -1118,6 +1135,7 @@
foreach (@encoder_block_sizes) {
($w, $h) = @$_;
add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
if ($w != 128 && $h != 128) {
specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
@@ -1167,6 +1185,23 @@
specialize qw/aom_highbd_sad_skip_32x8x4d avx2 sse2/;
specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2/;
specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2/;
+
+ specialize qw/aom_highbd_sad128x128x3d avx2/;
+ specialize qw/aom_highbd_sad128x64x3d avx2/;
+ specialize qw/aom_highbd_sad64x128x3d avx2/;
+ specialize qw/aom_highbd_sad64x64x3d avx2/;
+ specialize qw/aom_highbd_sad64x32x3d avx2/;
+ specialize qw/aom_highbd_sad32x64x3d avx2/;
+ specialize qw/aom_highbd_sad32x32x3d avx2/;
+ specialize qw/aom_highbd_sad32x16x3d avx2/;
+ specialize qw/aom_highbd_sad16x32x3d avx2/;
+ specialize qw/aom_highbd_sad16x16x3d avx2/;
+ specialize qw/aom_highbd_sad16x8x3d avx2/;
+
+ specialize qw/aom_highbd_sad16x4x3d avx2/;
+ specialize qw/aom_highbd_sad32x8x3d avx2/;
+ specialize qw/aom_highbd_sad16x64x3d avx2/;
+ specialize qw/aom_highbd_sad64x16x3d avx2/;
}
#
# Avg
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 94260ce..5b7b0e4 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -120,70 +120,93 @@
} \
}
#endif // CONFIG_REALTIME_ONLY
+// Call SIMD version of aom_sad_mxnx4d if the 3d version is unavailable.
+#define SAD_MXNX3D(m, n) \
+ void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \
+ }
// 128x128
SADMXN(128, 128)
SAD_MXNX4D(128, 128)
+SAD_MXNX3D(128, 128)
// 128x64
SADMXN(128, 64)
SAD_MXNX4D(128, 64)
+SAD_MXNX3D(128, 64)
// 64x128
SADMXN(64, 128)
SAD_MXNX4D(64, 128)
+SAD_MXNX3D(64, 128)
// 64x64
SADMXN(64, 64)
SAD_MXNX4D(64, 64)
+SAD_MXNX3D(64, 64)
// 64x32
SADMXN(64, 32)
SAD_MXNX4D(64, 32)
+SAD_MXNX3D(64, 32)
// 32x64
SADMXN(32, 64)
SAD_MXNX4D(32, 64)
+SAD_MXNX3D(32, 64)
// 32x32
SADMXN(32, 32)
SAD_MXNX4D(32, 32)
+SAD_MXNX3D(32, 32)
// 32x16
SADMXN(32, 16)
SAD_MXNX4D(32, 16)
+SAD_MXNX3D(32, 16)
// 16x32
SADMXN(16, 32)
SAD_MXNX4D(16, 32)
+SAD_MXNX3D(16, 32)
// 16x16
SADMXN(16, 16)
SAD_MXNX4D(16, 16)
+SAD_MXNX3D(16, 16)
// 16x8
SADMXN(16, 8)
SAD_MXNX4D(16, 8)
+SAD_MXNX3D(16, 8)
// 8x16
SADMXN(8, 16)
SAD_MXNX4D(8, 16)
+SAD_MXNX3D(8, 16)
// 8x8
SADMXN(8, 8)
SAD_MXNX4D(8, 8)
+SAD_MXNX3D(8, 8)
// 8x4
SADMXN(8, 4)
SAD_MXNX4D(8, 4)
+SAD_MXNX3D(8, 4)
// 4x8
SADMXN(4, 8)
SAD_MXNX4D(4, 8)
+SAD_MXNX3D(4, 8)
// 4x4
SADMXN(4, 4)
SAD_MXNX4D(4, 4)
+SAD_MXNX3D(4, 4)
SAD_MXH(128)
SAD_MXH(64)
@@ -204,6 +227,14 @@
SAD_MXNX4D(16, 64)
SADMXN(64, 16)
SAD_MXNX4D(64, 16)
+#if !CONFIG_REALTIME_ONLY
+SAD_MXNX3D(4, 16)
+SAD_MXNX3D(16, 4)
+SAD_MXNX3D(8, 32)
+SAD_MXNX3D(32, 8)
+SAD_MXNX3D(16, 64)
+SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
#if CONFIG_AV1_HIGHBITDEPTH
static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
@@ -291,70 +322,94 @@
2 * ref_stride, (m), (n / 2)); \
} \
}
+// Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable.
+#define HIGHBD_SAD_MXNX3D(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
// 128x128
HIGHBD_SADMXN(128, 128)
HIGHBD_SAD_MXNX4D(128, 128)
+HIGHBD_SAD_MXNX3D(128, 128)
// 128x64
HIGHBD_SADMXN(128, 64)
HIGHBD_SAD_MXNX4D(128, 64)
+HIGHBD_SAD_MXNX3D(128, 64)
// 64x128
HIGHBD_SADMXN(64, 128)
HIGHBD_SAD_MXNX4D(64, 128)
+HIGHBD_SAD_MXNX3D(64, 128)
// 64x64
HIGHBD_SADMXN(64, 64)
HIGHBD_SAD_MXNX4D(64, 64)
+HIGHBD_SAD_MXNX3D(64, 64)
// 64x32
HIGHBD_SADMXN(64, 32)
HIGHBD_SAD_MXNX4D(64, 32)
+HIGHBD_SAD_MXNX3D(64, 32)
// 32x64
HIGHBD_SADMXN(32, 64)
HIGHBD_SAD_MXNX4D(32, 64)
+HIGHBD_SAD_MXNX3D(32, 64)
// 32x32
HIGHBD_SADMXN(32, 32)
HIGHBD_SAD_MXNX4D(32, 32)
+HIGHBD_SAD_MXNX3D(32, 32)
// 32x16
HIGHBD_SADMXN(32, 16)
HIGHBD_SAD_MXNX4D(32, 16)
+HIGHBD_SAD_MXNX3D(32, 16)
// 16x32
HIGHBD_SADMXN(16, 32)
HIGHBD_SAD_MXNX4D(16, 32)
+HIGHBD_SAD_MXNX3D(16, 32)
// 16x16
HIGHBD_SADMXN(16, 16)
HIGHBD_SAD_MXNX4D(16, 16)
+HIGHBD_SAD_MXNX3D(16, 16)
// 16x8
HIGHBD_SADMXN(16, 8)
HIGHBD_SAD_MXNX4D(16, 8)
+HIGHBD_SAD_MXNX3D(16, 8)
// 8x16
HIGHBD_SADMXN(8, 16)
HIGHBD_SAD_MXNX4D(8, 16)
+HIGHBD_SAD_MXNX3D(8, 16)
// 8x8
HIGHBD_SADMXN(8, 8)
HIGHBD_SAD_MXNX4D(8, 8)
+HIGHBD_SAD_MXNX3D(8, 8)
// 8x4
HIGHBD_SADMXN(8, 4)
HIGHBD_SAD_MXNX4D(8, 4)
+HIGHBD_SAD_MXNX3D(8, 4)
// 4x8
HIGHBD_SADMXN(4, 8)
HIGHBD_SAD_MXNX4D(4, 8)
+HIGHBD_SAD_MXNX3D(4, 8)
// 4x4
HIGHBD_SADMXN(4, 4)
HIGHBD_SAD_MXNX4D(4, 4)
+HIGHBD_SAD_MXNX3D(4, 4)
HIGHBD_SADMXN(4, 16)
HIGHBD_SAD_MXNX4D(4, 16)
@@ -368,4 +423,13 @@
HIGHBD_SAD_MXNX4D(16, 64)
HIGHBD_SADMXN(64, 16)
HIGHBD_SAD_MXNX4D(64, 16)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_SAD_MXNX3D(4, 16)
+HIGHBD_SAD_MXNX3D(16, 4)
+HIGHBD_SAD_MXNX3D(8, 32)
+HIGHBD_SAD_MXNX3D(32, 8)
+HIGHBD_SAD_MXNX3D(16, 64)
+HIGHBD_SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index dae4197..6603d31 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -90,6 +90,7 @@
aom_subpixvariance_fn_t svf;
aom_subp_avg_variance_fn_t svaf;
aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
// Same as sadx4, but downsample the rows by a factor of 2.
aom_sad_multi_d_fn_t sdsx4df;
aom_masked_sad_fn_t msdf;
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index f583772..e11754e 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -602,71 +602,34 @@
s[3] = _mm256_setzero_si256();
}
-static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
+static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
+ int M, int N, int D, const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
__m256i sad_vec[4];
const uint16_t *refp[4];
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
const uint16_t *srcp;
- const int shift_for_4_rows = 2;
- int i, j;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- for (j = 0; j < N; j += 4) {
- sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- srcp += src_stride << shift_for_4_rows;
- refp[i] += ref_stride << shift_for_4_rows;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_4_rows = 2;
+ const int shift_for_rows = (M < 128) + (M < 64);
+ const int row_units = 1 << shift_for_rows;
int i, r;
init_sad(sad_vec);
convert_pointers(ref_array, refp);
- for (i = 0; i < 4; ++i) {
+ for (i = 0; i < D; ++i) {
srcp = keep;
- for (r = 0; r < N; r += 4) {
- sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- srcp += src_stride << shift_for_4_rows;
- refp[i] += ref_stride << shift_for_4_rows;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_rows = 1;
- int i, r;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- for (r = 0; r < N; r += 2) {
- sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ for (r = 0; r < N; r += row_units) {
+ if (M == 128) {
+ sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+ } else if (M == 64) {
+ sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ } else if (M == 32) {
+ sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else if (M == 16) {
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else {
+ assert(0);
+ }
srcp += src_stride << shift_for_rows;
refp[i] += ref_stride << shift_for_rows;
}
@@ -674,47 +637,31 @@
get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
}
-static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2(
- int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- int i, r;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- for (r = 0; r < N; r++) {
- sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
- srcp += src_stride;
- refp[i] += ref_stride;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
- void aom_highbd_sad##m##x##n##x4d_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
- sad_array); \
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
}
#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \
void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \
const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
int ref_stride, uint32_t *sad_array) { \
- aom_highbd_sad##m##xNx4d_avx2((n / 2), src, 2 * src_stride, ref_array, \
- 2 * ref_stride, sad_array); \
+ aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array); \
sad_array[0] <<= 1; \
sad_array[1] <<= 1; \
sad_array[2] <<= 1; \
sad_array[3] <<= 1; \
}
+#define HIGHBD_SAD_MXNX3D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
HIGHBD_SAD_MXNX4D_AVX2(16, 4)
HIGHBD_SAD_MXNX4D_AVX2(16, 8)
@@ -752,3 +699,22 @@
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(16, 4)
+HIGHBD_SAD_MXNX3D_AVX2(16, 8)
+HIGHBD_SAD_MXNX3D_AVX2(16, 16)
+HIGHBD_SAD_MXNX3D_AVX2(16, 32)
+HIGHBD_SAD_MXNX3D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(32, 8)
+HIGHBD_SAD_MXNX3D_AVX2(32, 16)
+HIGHBD_SAD_MXNX3D_AVX2(32, 32)
+HIGHBD_SAD_MXNX3D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(64, 16)
+HIGHBD_SAD_MXNX3D_AVX2(64, 32)
+HIGHBD_SAD_MXNX3D_AVX2(64, 64)
+HIGHBD_SAD_MXNX3D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(128, 64)
+HIGHBD_SAD_MXNX3D_AVX2(128, 128)
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 2f523f7..7629cf4 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -13,6 +13,35 @@
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
+ __m256i sum_ref0,
+ __m256i sum_ref1,
+ __m256i sum_ref2,
+ __m256i sum_ref3) {
+ __m128i sum;
+ // In sum_ref-i the result is saved in the first 4 bytes and the other 4
+ // bytes are zeroed.
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ // 0, 0, 1, 1
+ sum_ref0 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(sum_ref0), _mm256_castsi256_ps(sum_ref1),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ // 2, 2, 3, 3
+ sum_ref2 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(sum_ref2), _mm256_castsi256_ps(sum_ref3),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+
+ // sum adjacent 32 bit integers
+ sum_ref0 = _mm256_hadd_epi32(sum_ref0, sum_ref2);
+
+ // add the low 128 bit to the high 128 bit
+ sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0),
+ _mm256_extractf128_si256(sum_ref0, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+}
static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
int M, int N, const uint8_t *src, int src_stride,
@@ -57,29 +86,49 @@
ref2 += ref_stride;
ref3 += ref_stride;
}
- {
- __m128i sum;
- // In sum_ref-i the result is saved in the first 4 bytes and the other 4
- // bytes are zeroed.
- // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
- // 0, 0, 1, 1
- sum_ref0 = _mm256_castps_si256(_mm256_shuffle_ps(
- _mm256_castsi256_ps(sum_ref0), _mm256_castsi256_ps(sum_ref1),
- _MM_SHUFFLE(2, 0, 2, 0)));
- // 2, 2, 3, 3
- sum_ref2 = _mm256_castps_si256(_mm256_shuffle_ps(
- _mm256_castsi256_ps(sum_ref2), _mm256_castsi256_ps(sum_ref3),
- _MM_SHUFFLE(2, 0, 2, 0)));
- // sum adjacent 32 bit integers
- sum_ref0 = _mm256_hadd_epi32(sum_ref0, sum_ref2);
+ aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2, sum_ref3);
+}
- // add the low 128 bit to the high 128 bit
- sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0),
- _mm256_extractf128_si256(sum_ref0, 1));
+static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ int i, j;
+ const uint8_t *ref0, *ref1, *ref2;
- _mm_storeu_si128((__m128i *)(res), sum);
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j += 32) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ }
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
}
+ aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2,
+ _mm256_setzero_si256());
}
#define SADMXN_AVX2(m, n) \
@@ -87,6 +136,11 @@
const uint8_t *const ref[4], int ref_stride, \
uint32_t res[4]) { \
aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ } \
+ void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
}
SADMXN_AVX2(32, 8)
@@ -126,3 +180,63 @@
SAD_SKIP_MXN_AVX2(128, 64)
SAD_SKIP_MXN_AVX2(128, 128)
+
+static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ const uint8_t *ref0, *ref1, *ref2;
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2,
+ _mm256_setzero_si256());
+}
+
+#define SAD16XNX3_AVX2(n) \
+ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SAD16XNX3_AVX2(32)
+SAD16XNX3_AVX2(16)
+SAD16XNX3_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD16XNX3_AVX2(64)
+SAD16XNX3_AVX2(4)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index d9de962..6c390d6 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -965,124 +965,129 @@
}
}
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
- ppi->fn_ptr[BT].sdf = SDF; \
- ppi->fn_ptr[BT].sdaf = SDAF; \
- ppi->fn_ptr[BT].vf = VF; \
- ppi->fn_ptr[BT].svf = SVF; \
- ppi->fn_ptr[BT].svaf = SVAF; \
- ppi->fn_ptr[BT].sdx4df = SDX4DF; \
- ppi->fn_ptr[BT].jsdaf = JSDAF; \
- ppi->fn_ptr[BT].jsvaf = JSVAF;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF;
// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
- aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+ aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg,
aom_dist_wtd_sub_pixel_avg_variance4x16)
BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
- aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+ aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg,
aom_dist_wtd_sub_pixel_avg_variance16x4)
BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
- aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+ aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg,
aom_dist_wtd_sub_pixel_avg_variance8x32)
BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
- aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+ aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg,
aom_dist_wtd_sub_pixel_avg_variance32x8)
BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
- aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+ aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg,
aom_dist_wtd_sub_pixel_avg_variance16x64)
BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
- aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+ aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg,
aom_dist_wtd_sub_pixel_avg_variance64x16)
#endif // !CONFIG_REALTIME_ONLY
BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
- aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+ aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg,
aom_dist_wtd_sub_pixel_avg_variance128x128)
BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
- aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+ aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg,
aom_dist_wtd_sub_pixel_avg_variance128x64)
BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
- aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+ aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg,
aom_dist_wtd_sub_pixel_avg_variance64x128)
BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
- aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+ aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg,
aom_dist_wtd_sub_pixel_avg_variance32x16)
BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
- aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+ aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg,
aom_dist_wtd_sub_pixel_avg_variance16x32)
BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
- aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+ aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg,
aom_dist_wtd_sub_pixel_avg_variance64x32)
BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
- aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+ aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg,
aom_dist_wtd_sub_pixel_avg_variance32x64)
BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
- aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+ aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg,
aom_dist_wtd_sub_pixel_avg_variance32x32)
BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
- aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+ aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg,
aom_dist_wtd_sub_pixel_avg_variance64x64)
BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
- aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+ aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg,
aom_dist_wtd_sub_pixel_avg_variance16x16)
BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
- aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+ aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg,
aom_dist_wtd_sub_pixel_avg_variance16x8)
BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
- aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+ aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg,
aom_dist_wtd_sub_pixel_avg_variance8x16)
BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
- aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
+ aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x8)
BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
- aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
+ aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x4)
BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
- aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
+ aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x8)
BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
- aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
+ aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x4)
#if !CONFIG_REALTIME_ONLY
#define OBFP(BT, OSDF, OVF, OSVF) \
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 5b59289..92e69da 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -127,14 +127,15 @@
force_intpel_info->rate_size = 0;
}
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
- ppi->fn_ptr[BT].sdf = SDF; \
- ppi->fn_ptr[BT].sdaf = SDAF; \
- ppi->fn_ptr[BT].vf = VF; \
- ppi->fn_ptr[BT].svf = SVF; \
- ppi->fn_ptr[BT].svaf = SVAF; \
- ppi->fn_ptr[BT].sdx4df = SDX4DF; \
- ppi->fn_ptr[BT].jsdaf = JSDAF; \
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
ppi->fn_ptr[BT].jsvaf = JSVAF;
#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \
@@ -145,6 +146,7 @@
aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \
aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \
aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \
aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
@@ -234,71 +236,93 @@
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d)
#if !CONFIG_REALTIME_ONLY
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d)
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d)
#endif
MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index ad0be67..4c2eb21 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -124,9 +124,12 @@
if (use_downsampled_sad) {
ms_params->sdf = ms_params->vfp->sdsf;
ms_params->sdx4df = ms_params->vfp->sdsx4df;
+ // Skip version of sadx3 is not available yet, so fall back to sdsx4df
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
} else {
ms_params->sdf = ms_params->vfp->sdf;
ms_params->sdx4df = ms_params->vfp->sdx4df;
+ ms_params->sdx3df = ms_params->vfp->sdx3df;
}
ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
@@ -909,7 +912,7 @@
center_address,
};
unsigned int sads[4];
- ms_params->sdx4df(src->buf, src->stride, block_offset, ref->stride, sads);
+ ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
for (int j = 0; j < 3; j++) {
const int index = chkpts_indices[j];
const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
@@ -1799,6 +1802,7 @@
FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
new_ms_params.sdf = new_ms_params.vfp->sdf;
new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+ new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
cost_list, best_mv, second_best_mv);
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index b5ff837..5aba8d9 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -137,6 +137,7 @@
// sdf in vfp (e.g. downsampled sad and not sad) to allow speed up.
aom_sad_fn_t sdf;
aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
} FULLPEL_MOTION_SEARCH_PARAMS;
void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 24a45b7..9dae336 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -397,6 +397,41 @@
}
};
+class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+ public SADTestBase {
+ public:
+ SADx3Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ void SADs(unsigned int *results) {
+ const uint8_t *references[] = { GetReference(0), GetReference(1),
+ GetReference(2), GetReference(3) };
+
+ API_REGISTER_STATE_CHECK(GET_PARAM(2)(
+ source_data_, source_stride_, references, reference_stride_, results));
+ }
+
+ void CheckSADs() {
+ unsigned int reference_sad, exp_sad[4];
+
+ SADs(exp_sad);
+ for (int block = 0; block < 3; ++block) {
+ reference_sad = ReferenceSAD(block);
+
+ EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+ }
+ }
+
+ void SpeedSAD() {
+ int test_count = 2000000;
+ unsigned int exp_sad[4];
+ while (test_count > 0) {
+ SADs(exp_sad);
+ test_count -= 1;
+ }
+ }
+};
+
class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
public SADTestBase {
public:
@@ -959,6 +994,7 @@
source_stride_ = tmp_stride;
}
+// SADx4
TEST_P(SADx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(GetReference(0), reference_stride_, mask_);
@@ -1040,6 +1076,88 @@
SpeedSAD();
}
+// SADx3
+TEST_P(SADx3Test, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(GetReference(0), reference_stride_, mask_);
+ FillConstant(GetReference(1), reference_stride_, mask_);
+ FillConstant(GetReference(2), reference_stride_, mask_);
+ FillConstant(GetReference(3), reference_stride_, mask_);
+ CheckSADs();
+}
+
+TEST_P(SADx3Test, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(GetReference(0), reference_stride_, 0);
+ FillConstant(GetReference(1), reference_stride_, 0);
+ FillConstant(GetReference(2), reference_stride_, 0);
+ FillConstant(GetReference(3), reference_stride_, 0);
+ CheckSADs();
+}
+
+TEST_P(SADx3Test, ShortRef) {
+ int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, ShortSrc) {
+ int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 1000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, SrcAlignedByWidth) {
+ uint8_t *tmp_source_data = source_data_;
+ source_data_ += width_;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ source_data_ = tmp_source_data;
+}
+
+TEST_P(SADx3Test, DISABLED_Speed) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ SpeedSAD();
+}
+
// SADSkipx4
TEST_P(SADSkipx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
@@ -1660,6 +1778,108 @@
};
INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+const SadMxNx4Param x3d_c_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x3d_c, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_c, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_c, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_c, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_c, -1),
+ make_tuple(32, 64, &aom_sad32x64x3d_c, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_c, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_c, -1),
+ make_tuple(16, 32, &aom_sad16x32x3d_c, -1),
+ make_tuple(16, 16, &aom_sad16x16x3d_c, -1),
+ make_tuple(16, 8, &aom_sad16x8x3d_c, -1),
+ make_tuple(8, 16, &aom_sad8x16x3d_c, -1),
+ make_tuple(8, 8, &aom_sad8x8x3d_c, -1),
+ make_tuple(8, 4, &aom_sad8x4x3d_c, -1),
+ make_tuple(4, 8, &aom_sad4x8x3d_c, -1),
+ make_tuple(4, 4, &aom_sad4x4x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 12),
+#endif
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16x3d_c, -1),
+ make_tuple(16, 64, &aom_sad16x64x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 12),
+#endif
+ make_tuple(32, 8, &aom_sad32x8x3d_c, -1),
+ make_tuple(8, 32, &aom_sad8x32x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 12),
+#endif
+ make_tuple(16, 4, &aom_sad16x4x3d_c, -1),
+ make_tuple(4, 16, &aom_sad4x16x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 12),
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADx3Test, ::testing::ValuesIn(x3d_c_tests));
+
const SadMxNx4Param skip_x4d_c_tests[] = {
make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
@@ -2837,6 +3057,74 @@
#endif
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+
+const SadMxNx4Param x3d_avx2_tests[] = {
+ make_tuple(32, 64, &aom_sad32x64x3d_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_avx2, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128x3d_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(32, 8, &aom_sad32x8x3d_avx2, -1),
+ make_tuple(64, 16, &aom_sad64x16x3d_avx2, -1),
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 12),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx3Test, ::testing::ValuesIn(x3d_avx2_tests));
#endif // HAVE_AVX2
} // namespace