AVX2: Add optimization for sad_mxnx3d

Add x3d versions of the SAD functions, which compute the SAD of a
source block against 3 references instead of 4, along with AVX2
implementations and tests. The C x3d functions fall back to the
corresponding x4d functions, and the motion-search path that only
evaluates 3 candidate MVs now uses sdx3df instead of sdx4df.

BUG=aomedia:3358

Change-Id: I9ceead2183c2d9673548af57634070a3738d14ab
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 3bc307b..427a3dc 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -999,6 +999,7 @@
   foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
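+    # The x3d variant computes SADs for only the first 3 references; the
+    # array arguments keep 4 entries to match the x4d signature.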
+    add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
     add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]";
     add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
     add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
@@ -1052,6 +1053,22 @@
   specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
 
+  specialize qw/aom_sad128x128x3d avx2/;
+  specialize qw/aom_sad128x64x3d  avx2/;
+  specialize qw/aom_sad64x128x3d  avx2/;
+  specialize qw/aom_sad64x64x3d   avx2/;
+  specialize qw/aom_sad64x32x3d   avx2/;
+  specialize qw/aom_sad32x64x3d   avx2/;
+  specialize qw/aom_sad32x32x3d   avx2/;
+  specialize qw/aom_sad32x16x3d   avx2/;
+  specialize qw/aom_sad16x32x3d   avx2/;
+  specialize qw/aom_sad16x16x3d   avx2/;
+  specialize qw/aom_sad16x8x3d    avx2/;
+
+  specialize qw/aom_sad64x16x3d   avx2/;
+  specialize qw/aom_sad32x8x3d    avx2/;
+  specialize qw/aom_sad16x64x3d   avx2/;
+
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     specialize qw/aom_sad128x128x4d_avg sse2/;
     specialize qw/aom_sad128x64x4d_avg  sse2/;
@@ -1118,6 +1135,7 @@
     foreach (@encoder_block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+      add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
       add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
       if ($w != 128 && $h != 128) {
         specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
@@ -1167,6 +1185,23 @@
     specialize qw/aom_highbd_sad_skip_32x8x4d    avx2 sse2/;
     specialize qw/aom_highbd_sad_skip_16x64x4d   avx2 sse2/;
     specialize qw/aom_highbd_sad_skip_64x16x4d   avx2 sse2/;
+
+    specialize qw/aom_highbd_sad128x128x3d avx2/;
+    specialize qw/aom_highbd_sad128x64x3d  avx2/;
+    specialize qw/aom_highbd_sad64x128x3d  avx2/;
+    specialize qw/aom_highbd_sad64x64x3d   avx2/;
+    specialize qw/aom_highbd_sad64x32x3d   avx2/;
+    specialize qw/aom_highbd_sad32x64x3d   avx2/;
+    specialize qw/aom_highbd_sad32x32x3d   avx2/;
+    specialize qw/aom_highbd_sad32x16x3d   avx2/;
+    specialize qw/aom_highbd_sad16x32x3d   avx2/;
+    specialize qw/aom_highbd_sad16x16x3d   avx2/;
+    specialize qw/aom_highbd_sad16x8x3d    avx2/;
+
+    specialize qw/aom_highbd_sad16x4x3d    avx2/;
+    specialize qw/aom_highbd_sad32x8x3d    avx2/;
+    specialize qw/aom_highbd_sad16x64x3d   avx2/;
+    specialize qw/aom_highbd_sad64x16x3d   avx2/;
   }
   #
   # Avg
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 94260ce..5b7b0e4 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -120,70 +120,93 @@
     }                                                                         \
   }
 #endif  // CONFIG_REALTIME_ONLY
+// Calls the SIMD version of aom_sad_mxnx4d if the 3d version is unavailable.
+#define SAD_MXNX3D(m, n)                                                      \
+  void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride,            \
+                               const uint8_t *const ref_array[4],             \
+                               int ref_stride, uint32_t sad_array[4]) {       \
+    aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \
+  }
 
 // 128x128
 SADMXN(128, 128)
 SAD_MXNX4D(128, 128)
+SAD_MXNX3D(128, 128)
 
 // 128x64
 SADMXN(128, 64)
 SAD_MXNX4D(128, 64)
+SAD_MXNX3D(128, 64)
 
 // 64x128
 SADMXN(64, 128)
 SAD_MXNX4D(64, 128)
+SAD_MXNX3D(64, 128)
 
 // 64x64
 SADMXN(64, 64)
 SAD_MXNX4D(64, 64)
+SAD_MXNX3D(64, 64)
 
 // 64x32
 SADMXN(64, 32)
 SAD_MXNX4D(64, 32)
+SAD_MXNX3D(64, 32)
 
 // 32x64
 SADMXN(32, 64)
 SAD_MXNX4D(32, 64)
+SAD_MXNX3D(32, 64)
 
 // 32x32
 SADMXN(32, 32)
 SAD_MXNX4D(32, 32)
+SAD_MXNX3D(32, 32)
 
 // 32x16
 SADMXN(32, 16)
 SAD_MXNX4D(32, 16)
+SAD_MXNX3D(32, 16)
 
 // 16x32
 SADMXN(16, 32)
 SAD_MXNX4D(16, 32)
+SAD_MXNX3D(16, 32)
 
 // 16x16
 SADMXN(16, 16)
 SAD_MXNX4D(16, 16)
+SAD_MXNX3D(16, 16)
 
 // 16x8
 SADMXN(16, 8)
 SAD_MXNX4D(16, 8)
+SAD_MXNX3D(16, 8)
 
 // 8x16
 SADMXN(8, 16)
 SAD_MXNX4D(8, 16)
+SAD_MXNX3D(8, 16)
 
 // 8x8
 SADMXN(8, 8)
 SAD_MXNX4D(8, 8)
+SAD_MXNX3D(8, 8)
 
 // 8x4
 SADMXN(8, 4)
 SAD_MXNX4D(8, 4)
+SAD_MXNX3D(8, 4)
 
 // 4x8
 SADMXN(4, 8)
 SAD_MXNX4D(4, 8)
+SAD_MXNX3D(4, 8)
 
 // 4x4
 SADMXN(4, 4)
 SAD_MXNX4D(4, 4)
+SAD_MXNX3D(4, 4)
 
 SAD_MXH(128)
 SAD_MXH(64)
@@ -204,6 +227,14 @@
 SAD_MXNX4D(16, 64)
 SADMXN(64, 16)
 SAD_MXNX4D(64, 16)
+#if !CONFIG_REALTIME_ONLY
+SAD_MXNX3D(4, 16)
+SAD_MXNX3D(16, 4)
+SAD_MXNX3D(8, 32)
+SAD_MXNX3D(32, 8)
+SAD_MXNX3D(16, 64)
+SAD_MXNX3D(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_AV1_HIGHBITDEPTH
 static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
@@ -291,70 +322,94 @@
                                     2 * ref_stride, (m), (n / 2));           \
     }                                                                        \
   }
+// Calls the SIMD version of aom_highbd_sad_mxnx4d if the 3d version is
+// unavailable.
+#define HIGHBD_SAD_MXNX3D(m, n)                                              \
+  void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride,    \
+                                      const uint8_t *const ref_array[],      \
+                                      int ref_stride, uint32_t *sad_array) { \
+    aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride,     \
+                                 sad_array);                                 \
+  }
 
 // 128x128
 HIGHBD_SADMXN(128, 128)
 HIGHBD_SAD_MXNX4D(128, 128)
+HIGHBD_SAD_MXNX3D(128, 128)
 
 // 128x64
 HIGHBD_SADMXN(128, 64)
 HIGHBD_SAD_MXNX4D(128, 64)
+HIGHBD_SAD_MXNX3D(128, 64)
 
 // 64x128
 HIGHBD_SADMXN(64, 128)
 HIGHBD_SAD_MXNX4D(64, 128)
+HIGHBD_SAD_MXNX3D(64, 128)
 
 // 64x64
 HIGHBD_SADMXN(64, 64)
 HIGHBD_SAD_MXNX4D(64, 64)
+HIGHBD_SAD_MXNX3D(64, 64)
 
 // 64x32
 HIGHBD_SADMXN(64, 32)
 HIGHBD_SAD_MXNX4D(64, 32)
+HIGHBD_SAD_MXNX3D(64, 32)
 
 // 32x64
 HIGHBD_SADMXN(32, 64)
 HIGHBD_SAD_MXNX4D(32, 64)
+HIGHBD_SAD_MXNX3D(32, 64)
 
 // 32x32
 HIGHBD_SADMXN(32, 32)
 HIGHBD_SAD_MXNX4D(32, 32)
+HIGHBD_SAD_MXNX3D(32, 32)
 
 // 32x16
 HIGHBD_SADMXN(32, 16)
 HIGHBD_SAD_MXNX4D(32, 16)
+HIGHBD_SAD_MXNX3D(32, 16)
 
 // 16x32
 HIGHBD_SADMXN(16, 32)
 HIGHBD_SAD_MXNX4D(16, 32)
+HIGHBD_SAD_MXNX3D(16, 32)
 
 // 16x16
 HIGHBD_SADMXN(16, 16)
 HIGHBD_SAD_MXNX4D(16, 16)
+HIGHBD_SAD_MXNX3D(16, 16)
 
 // 16x8
 HIGHBD_SADMXN(16, 8)
 HIGHBD_SAD_MXNX4D(16, 8)
+HIGHBD_SAD_MXNX3D(16, 8)
 
 // 8x16
 HIGHBD_SADMXN(8, 16)
 HIGHBD_SAD_MXNX4D(8, 16)
+HIGHBD_SAD_MXNX3D(8, 16)
 
 // 8x8
 HIGHBD_SADMXN(8, 8)
 HIGHBD_SAD_MXNX4D(8, 8)
+HIGHBD_SAD_MXNX3D(8, 8)
 
 // 8x4
 HIGHBD_SADMXN(8, 4)
 HIGHBD_SAD_MXNX4D(8, 4)
+HIGHBD_SAD_MXNX3D(8, 4)
 
 // 4x8
 HIGHBD_SADMXN(4, 8)
 HIGHBD_SAD_MXNX4D(4, 8)
+HIGHBD_SAD_MXNX3D(4, 8)
 
 // 4x4
 HIGHBD_SADMXN(4, 4)
 HIGHBD_SAD_MXNX4D(4, 4)
+HIGHBD_SAD_MXNX3D(4, 4)
 
 HIGHBD_SADMXN(4, 16)
 HIGHBD_SAD_MXNX4D(4, 16)
@@ -368,4 +423,13 @@
 HIGHBD_SAD_MXNX4D(16, 64)
 HIGHBD_SADMXN(64, 16)
 HIGHBD_SAD_MXNX4D(64, 16)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_SAD_MXNX3D(4, 16)
+HIGHBD_SAD_MXNX3D(16, 4)
+HIGHBD_SAD_MXNX3D(8, 32)
+HIGHBD_SAD_MXNX3D(32, 8)
+HIGHBD_SAD_MXNX3D(16, 64)
+HIGHBD_SAD_MXNX3D(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index dae4197..6603d31 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -90,6 +90,7 @@
   aom_subpixvariance_fn_t svf;
   aom_subp_avg_variance_fn_t svaf;
   aom_sad_multi_d_fn_t sdx4df;
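+  // Same as sdx4df, but computes SADs for only the first 3 references.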
+  aom_sad_multi_d_fn_t sdx3df;
   // Same as sadx4, but downsample the rows by a factor of 2.
   aom_sad_multi_d_fn_t sdsx4df;
   aom_masked_sad_fn_t msdf;
diff --git a/aom_dsp/x86/highbd_sad_avx2.c b/aom_dsp/x86/highbd_sad_avx2.c
index f583772..e11754e 100644
--- a/aom_dsp/x86/highbd_sad_avx2.c
+++ b/aom_dsp/x86/highbd_sad_avx2.c
@@ -602,71 +602,34 @@
   s[3] = _mm256_setzero_si256();
 }
 
-static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2(
-    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
-    int ref_stride, uint32_t *sad_array) {
+static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
+    int M, int N, int D, const uint8_t *src, int src_stride,
+    const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) {
   __m256i sad_vec[4];
   const uint16_t *refp[4];
   const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
   const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
-  int i, j;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    for (j = 0; j < N; j += 4) {
-      sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-      srcp += src_stride << shift_for_4_rows;
-      refp[i] += ref_stride << shift_for_4_rows;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2(
-    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
-    int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
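+  // Rows consumed per inner-loop iteration: 1 when M == 128 (sad128x1),
+  // 2 when M == 64 (sad64x2), and 4 when M == 32 or 16 (sad32x4 / sad16x4).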
+  const int shift_for_rows = (M < 128) + (M < 64);
+  const int row_units = 1 << shift_for_rows;
   int i, r;
 
   init_sad(sad_vec);
   convert_pointers(ref_array, refp);
 
-  for (i = 0; i < 4; ++i) {
+  for (i = 0; i < D; ++i) {
     srcp = keep;
-    for (r = 0; r < N; r += 4) {
-      sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-      srcp += src_stride << shift_for_4_rows;
-      refp[i] += ref_stride << shift_for_4_rows;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2(
-    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
-    int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_rows = 1;
-  int i, r;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    for (r = 0; r < N; r += 2) {
-      sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+    for (r = 0; r < N; r += row_units) {
+      if (M == 128) {
+        sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+      } else if (M == 64) {
+        sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+      } else if (M == 32) {
+        sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+      } else if (M == 16) {
+        sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+      } else {
+        assert(0);
+      }
       srcp += src_stride << shift_for_rows;
       refp[i] += ref_stride << shift_for_rows;
     }
@@ -674,47 +637,31 @@
   get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
 }
 
-static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2(
-    int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[],
-    int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  int i, r;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    for (r = 0; r < N; r++) {
-      sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
-      srcp += src_stride;
-      refp[i] += ref_stride;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-#define HIGHBD_SAD_MXNX4D_AVX2(m, n)                                         \
-  void aom_highbd_sad##m##x##n##x4d_avx2(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],  \
-      int ref_stride, uint32_t *sad_array) {                                 \
-    aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
-                                  sad_array);                                \
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n)                                          \
+  void aom_highbd_sad##m##x##n##x4d_avx2(                                     \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],   \
+      int ref_stride, uint32_t *sad_array) {                                  \
+    aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
+                             sad_array);                                      \
   }
 #define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n)                                   \
   void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                             \
       const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \
       int ref_stride, uint32_t *sad_array) {                                \
-    aom_highbd_sad##m##xNx4d_avx2((n / 2), src, 2 * src_stride, ref_array,  \
-                                  2 * ref_stride, sad_array);               \
+    aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+                             2 * ref_stride, sad_array);                    \
     sad_array[0] <<= 1;                                                     \
     sad_array[1] <<= 1;                                                     \
     sad_array[2] <<= 1;                                                     \
     sad_array[3] <<= 1;                                                     \
   }
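+// The x3d variant accumulates SADs for only the first 3 references;
+// sad_vec[3] stays zero, so the 4-lane aggregation still writes sad_array[3]
+// but it carries no meaningful result.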
+#define HIGHBD_SAD_MXNX3D_AVX2(m, n)                                          \
+  void aom_highbd_sad##m##x##n##x3d_avx2(                                     \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[],   \
+      int ref_stride, uint32_t *sad_array) {                                  \
+    aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
+                             sad_array);                                      \
+  }
 
 HIGHBD_SAD_MXNX4D_AVX2(16, 4)
 HIGHBD_SAD_MXNX4D_AVX2(16, 8)
@@ -752,3 +699,22 @@
 
 HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
 HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(16, 4)
+HIGHBD_SAD_MXNX3D_AVX2(16, 8)
+HIGHBD_SAD_MXNX3D_AVX2(16, 16)
+HIGHBD_SAD_MXNX3D_AVX2(16, 32)
+HIGHBD_SAD_MXNX3D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(32, 8)
+HIGHBD_SAD_MXNX3D_AVX2(32, 16)
+HIGHBD_SAD_MXNX3D_AVX2(32, 32)
+HIGHBD_SAD_MXNX3D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(64, 16)
+HIGHBD_SAD_MXNX3D_AVX2(64, 32)
+HIGHBD_SAD_MXNX3D_AVX2(64, 64)
+HIGHBD_SAD_MXNX3D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(128, 64)
+HIGHBD_SAD_MXNX3D_AVX2(128, 128)
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 2f523f7..7629cf4 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -13,6 +13,35 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
+                                                     __m256i sum_ref0,
+                                                     __m256i sum_ref1,
+                                                     __m256i sum_ref2,
+                                                     __m256i sum_ref3) {
+  __m128i sum;
+  // In sum_ref-i the result is saved in the first 4 bytes and the other 4
+  // bytes are zeroed.
+  // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+  // 0, 0, 1, 1
+  sum_ref0 = _mm256_castps_si256(_mm256_shuffle_ps(
+      _mm256_castsi256_ps(sum_ref0), _mm256_castsi256_ps(sum_ref1),
+      _MM_SHUFFLE(2, 0, 2, 0)));
+  // 2, 2, 3, 3
+  sum_ref2 = _mm256_castps_si256(_mm256_shuffle_ps(
+      _mm256_castsi256_ps(sum_ref2), _mm256_castsi256_ps(sum_ref3),
+      _MM_SHUFFLE(2, 0, 2, 0)));
+
+  // sum adjacent 32 bit integers
+  sum_ref0 = _mm256_hadd_epi32(sum_ref0, sum_ref2);
+
+  // add the low 128 bit to the high 128 bit
+  sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0),
+                      _mm256_extractf128_si256(sum_ref0, 1));
+
+  _mm_storeu_si128((__m128i *)(res), sum);
+}
 
 static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
     int M, int N, const uint8_t *src, int src_stride,
@@ -57,29 +86,49 @@
     ref2 += ref_stride;
     ref3 += ref_stride;
   }
-  {
-    __m128i sum;
-    // In sum_ref-i the result is saved in the first 4 bytes and the other 4
-    // bytes are zeroed.
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    // 0, 0, 1, 1
-    sum_ref0 = _mm256_castps_si256(_mm256_shuffle_ps(
-        _mm256_castsi256_ps(sum_ref0), _mm256_castsi256_ps(sum_ref1),
-        _MM_SHUFFLE(2, 0, 2, 0)));
-    // 2, 2, 3, 3
-    sum_ref2 = _mm256_castps_si256(_mm256_shuffle_ps(
-        _mm256_castsi256_ps(sum_ref2), _mm256_castsi256_ps(sum_ref3),
-        _MM_SHUFFLE(2, 0, 2, 0)));
 
-    // sum adjacent 32 bit integers
-    sum_ref0 = _mm256_hadd_epi32(sum_ref0, sum_ref2);
+  aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2, sum_ref3);
+}
 
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0),
-                        _mm256_extractf128_si256(sum_ref0, 1));
+static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
+    int M, int N, const uint8_t *src, int src_stride,
+    const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2;
+  int i, j;
+  const uint8_t *ref0, *ref1, *ref2;
 
-    _mm_storeu_si128((__m128i *)(res), sum);
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  sum_ref0 = _mm256_setzero_si256();
+  sum_ref2 = _mm256_setzero_si256();
+  sum_ref1 = _mm256_setzero_si256();
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j += 32) {
+      // load src and all refs
+      src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+      ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+      ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+      ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+
+      // sum of the absolute differences between every ref-i to src
+      ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+      ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+      ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+      // sum every ref-i
+      sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+      sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+      sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+    }
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
   }
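+  // The fourth accumulator passed below is a zero vector, so res[3] is
+  // written as 0; x3d callers only read the first 3 results.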
+  aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2,
+                          _mm256_setzero_si256());
 }
 
 #define SADMXN_AVX2(m, n)                                                      \
@@ -87,6 +136,11 @@
                                   const uint8_t *const ref[4], int ref_stride, \
                                   uint32_t res[4]) {                           \
     aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
+  }                                                                            \
+  void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride,          \
+                                  const uint8_t *const ref[4], int ref_stride, \
+                                  uint32_t res[4]) {                           \
+    aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
   }
 
 SADMXN_AVX2(32, 8)
@@ -126,3 +180,63 @@
 
 SAD_SKIP_MXN_AVX2(128, 64)
 SAD_SKIP_MXN_AVX2(128, 128)
+
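+// Width-16 blocks cannot use aom_sadMxNx3d_avx2, which consumes 32 source
+// bytes per row, so two 16-byte rows are packed into one 256-bit register
+// with yy_loadu2_128 instead.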
+static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *const ref[4],
+                                                 int ref_stride,
+                                                 uint32_t res[4]) {
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2;
+  const uint8_t *ref0, *ref1, *ref2;
+  assert(N % 2 == 0);
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  sum_ref0 = _mm256_setzero_si256();
+  sum_ref2 = _mm256_setzero_si256();
+  sum_ref1 = _mm256_setzero_si256();
+
+  for (int i = 0; i < N; i += 2) {
+    // load src and all refs
+    src_reg = yy_loadu2_128(src + src_stride, src);
+    ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+    ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+    ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+
+    // sum of the absolute differences between every ref-i to src
+    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+
+    // sum every ref-i
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+
+    src += 2 * src_stride;
+    ref0 += 2 * ref_stride;
+    ref1 += 2 * ref_stride;
+    ref2 += 2 * ref_stride;
+  }
+
+  aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2,
+                          _mm256_setzero_si256());
+}
+
+#define SAD16XNX3_AVX2(n)                                                   \
+  void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride,          \
+                               const uint8_t *const ref[4], int ref_stride, \
+                               uint32_t res[4]) {                           \
+    aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res);          \
+  }
+
+SAD16XNX3_AVX2(32)
+SAD16XNX3_AVX2(16)
+SAD16XNX3_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD16XNX3_AVX2(64)
+SAD16XNX3_AVX2(4)
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index d9de962..6c390d6 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -965,124 +965,129 @@
     }
   }
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  ppi->fn_ptr[BT].sdf = SDF;                                    \
-  ppi->fn_ptr[BT].sdaf = SDAF;                                  \
-  ppi->fn_ptr[BT].vf = VF;                                      \
-  ppi->fn_ptr[BT].svf = SVF;                                    \
-  ppi->fn_ptr[BT].svaf = SVAF;                                  \
-  ppi->fn_ptr[BT].sdx4df = SDX4DF;                              \
-  ppi->fn_ptr[BT].jsdaf = JSDAF;                                \
-  ppi->fn_ptr[BT].jsvaf = JSVAF;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+  ppi->fn_ptr[BT].sdf = SDF;                                            \
+  ppi->fn_ptr[BT].sdaf = SDAF;                                          \
+  ppi->fn_ptr[BT].vf = VF;                                              \
+  ppi->fn_ptr[BT].svf = SVF;                                            \
+  ppi->fn_ptr[BT].svaf = SVAF;                                          \
+  ppi->fn_ptr[BT].sdx4df = SDX4DF;                                      \
+  ppi->fn_ptr[BT].jsdaf = JSDAF;                                        \
+  ppi->fn_ptr[BT].jsvaf = JSVAF;                                        \
+  ppi->fn_ptr[BT].sdx3df = SDX3DF;
 
 // Realtime mode doesn't use 4x rectangular blocks.
 #if !CONFIG_REALTIME_ONLY
   BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
       aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
-      aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+      aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance4x16)
 
   BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
       aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
-      aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+      aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg,
       aom_dist_wtd_sub_pixel_avg_variance16x4)
 
   BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
       aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
-      aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+      aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg,
       aom_dist_wtd_sub_pixel_avg_variance8x32)
 
   BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
       aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
-      aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+      aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg,
       aom_dist_wtd_sub_pixel_avg_variance32x8)
 
   BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
       aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
-      aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+      aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg,
       aom_dist_wtd_sub_pixel_avg_variance16x64)
 
   BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
       aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
-      aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+      aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance64x16)
 #endif  // !CONFIG_REALTIME_ONLY
 
   BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
       aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
-      aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+      aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg,
       aom_dist_wtd_sub_pixel_avg_variance128x128)
 
   BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
       aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
-      aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+      aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg,
       aom_dist_wtd_sub_pixel_avg_variance128x64)
 
   BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
       aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
-      aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+      aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg,
       aom_dist_wtd_sub_pixel_avg_variance64x128)
 
   BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
       aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
-      aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+      aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance32x16)
 
   BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
       aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
-      aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+      aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg,
       aom_dist_wtd_sub_pixel_avg_variance16x32)
 
   BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
       aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
-      aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+      aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg,
       aom_dist_wtd_sub_pixel_avg_variance64x32)
 
   BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
       aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
-      aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+      aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg,
       aom_dist_wtd_sub_pixel_avg_variance32x64)
 
   BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
       aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
-      aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+      aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg,
       aom_dist_wtd_sub_pixel_avg_variance32x32)
 
   BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
       aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
-      aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+      aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg,
       aom_dist_wtd_sub_pixel_avg_variance64x64)
 
   BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
       aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
-      aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+      aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance16x16)
 
   BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
       aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
-      aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+      aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg,
       aom_dist_wtd_sub_pixel_avg_variance16x8)
 
   BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
       aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
-      aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+      aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg,
       aom_dist_wtd_sub_pixel_avg_variance8x16)
 
   BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
       aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
-      aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
+      aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg,
+      aom_dist_wtd_sub_pixel_avg_variance8x8)
 
   BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
       aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
-      aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
+      aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg,
+      aom_dist_wtd_sub_pixel_avg_variance8x4)
 
   BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
       aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
-      aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
+      aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg,
+      aom_dist_wtd_sub_pixel_avg_variance4x8)
 
   BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
       aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
-      aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
+      aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg,
+      aom_dist_wtd_sub_pixel_avg_variance4x4)
 
 #if !CONFIG_REALTIME_ONLY
 #define OBFP(BT, OSDF, OVF, OSVF) \
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 5b59289..92e69da 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -127,14 +127,15 @@
   force_intpel_info->rate_size = 0;
 }
 
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  ppi->fn_ptr[BT].sdf = SDF;                                           \
-  ppi->fn_ptr[BT].sdaf = SDAF;                                         \
-  ppi->fn_ptr[BT].vf = VF;                                             \
-  ppi->fn_ptr[BT].svf = SVF;                                           \
-  ppi->fn_ptr[BT].svaf = SVAF;                                         \
-  ppi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
-  ppi->fn_ptr[BT].jsdaf = JSDAF;                                       \
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+  ppi->fn_ptr[BT].sdf = SDF;                                                   \
+  ppi->fn_ptr[BT].sdaf = SDAF;                                                 \
+  ppi->fn_ptr[BT].vf = VF;                                                     \
+  ppi->fn_ptr[BT].svf = SVF;                                                   \
+  ppi->fn_ptr[BT].svaf = SVAF;                                                 \
+  ppi->fn_ptr[BT].sdx4df = SDX4DF;                                             \
+  ppi->fn_ptr[BT].sdx3df = SDX3DF;                                             \
+  ppi->fn_ptr[BT].jsdaf = JSDAF;                                               \
   ppi->fn_ptr[BT].jsvaf = JSVAF;
 
 #define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD)                                \
@@ -145,6 +146,7 @@
       aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT,                \
       aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT,            \
       aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD,                        \
+      aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD,                        \
       aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD,              \
       aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
 
@@ -234,71 +236,93 @@
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
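+// The x4d wrapper macro is reused for the x3d functions; they share the same
+// signature.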
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d)
 
 #if !CONFIG_REALTIME_ONLY
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d)
 #endif
 
 MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index ad0be67..4c2eb21 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -124,9 +124,12 @@
   if (use_downsampled_sad) {
     ms_params->sdf = ms_params->vfp->sdsf;
     ms_params->sdx4df = ms_params->vfp->sdsx4df;
+    // A downsampled ("skip") version of sadx3 is not available yet, so fall
+    // back to the downsampled sadx4.
+    ms_params->sdx3df = ms_params->vfp->sdsx4df;
   } else {
     ms_params->sdf = ms_params->vfp->sdf;
     ms_params->sdx4df = ms_params->vfp->sdx4df;
+    ms_params->sdx3df = ms_params->vfp->sdx3df;
   }
 
   ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
@@ -909,7 +912,7 @@
     center_address,
   };
   unsigned int sads[4];
-  ms_params->sdx4df(src->buf, src->stride, block_offset, ref->stride, sads);
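+  // Only 3 candidate MVs are checked here, so the 3-reference SAD is used;
+  // sads[] keeps 4 entries to match the sdx3df signature.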
+  ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
   for (int j = 0; j < 3; j++) {
     const int index = chkpts_indices[j];
     const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
@@ -1799,6 +1802,7 @@
       FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
       new_ms_params.sdf = new_ms_params.vfp->sdf;
       new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+      new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
 
       return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
                                    cost_list, best_mv, second_best_mv);
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index b5ff837..5aba8d9 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -137,6 +137,7 @@
   // sdf in vfp (e.g. downsampled sad and not sad) to allow speed up.
   aom_sad_fn_t sdf;
   aom_sad_multi_d_fn_t sdx4df;
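+  // Same as sdx4df, but for only 3 references.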
+  aom_sad_multi_d_fn_t sdx3df;
 } FULLPEL_MOTION_SEARCH_PARAMS;
 
 void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 24a45b7..9dae336 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -397,6 +397,41 @@
   }
 };
 
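+// Tests the x3d SAD functions through the same 4-reference harness as
+// SADx4Test; only the first 3 results are checked against the reference SAD.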
+class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                  public SADTestBase {
+ public:
+  SADx3Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 3; ++block) {
+      reference_sad = ReferenceSAD(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
+};
+
 class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
                       public SADTestBase {
  public:
@@ -959,6 +994,7 @@
   source_stride_ = tmp_stride;
 }
 
+// SADx4
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(GetReference(0), reference_stride_, mask_);
@@ -1040,6 +1076,88 @@
   SpeedSAD();
 }
 
+// SADx3
+TEST_P(SADx3Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADx3Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADx3Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 1000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(GetReference(0), reference_stride_);
+    FillRandom(GetReference(1), reference_stride_);
+    FillRandom(GetReference(2), reference_stride_);
+    FillRandom(GetReference(3), reference_stride_);
+    CheckSADs();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx3Test, SrcAlignedByWidth) {
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+TEST_P(SADx3Test, DISABLED_Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+
 // SADSkipx4
 TEST_P(SADSkipx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -1660,6 +1778,108 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
+const SadMxNx4Param x3d_c_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128x3d_c, -1),
+  make_tuple(128, 64, &aom_sad128x64x3d_c, -1),
+  make_tuple(64, 128, &aom_sad64x128x3d_c, -1),
+  make_tuple(64, 64, &aom_sad64x64x3d_c, -1),
+  make_tuple(64, 32, &aom_sad64x32x3d_c, -1),
+  make_tuple(32, 64, &aom_sad32x64x3d_c, -1),
+  make_tuple(32, 32, &aom_sad32x32x3d_c, -1),
+  make_tuple(32, 16, &aom_sad32x16x3d_c, -1),
+  make_tuple(16, 32, &aom_sad16x32x3d_c, -1),
+  make_tuple(16, 16, &aom_sad16x16x3d_c, -1),
+  make_tuple(16, 8, &aom_sad16x8x3d_c, -1),
+  make_tuple(8, 16, &aom_sad8x16x3d_c, -1),
+  make_tuple(8, 8, &aom_sad8x8x3d_c, -1),
+  make_tuple(8, 4, &aom_sad8x4x3d_c, -1),
+  make_tuple(4, 8, &aom_sad4x8x3d_c, -1),
+  make_tuple(4, 4, &aom_sad4x4x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 8),
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 10),
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 12),
+#endif
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16x3d_c, -1),
+  make_tuple(16, 64, &aom_sad16x64x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 12),
+#endif
+  make_tuple(32, 8, &aom_sad32x8x3d_c, -1),
+  make_tuple(8, 32, &aom_sad8x32x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 12),
+#endif
+  make_tuple(16, 4, &aom_sad16x4x3d_c, -1),
+  make_tuple(4, 16, &aom_sad4x16x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 12),
+#endif
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADx3Test, ::testing::ValuesIn(x3d_c_tests));
+
 const SadMxNx4Param skip_x4d_c_tests[] = {
   make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
   make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
@@ -2837,6 +3057,74 @@
 #endif
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+
+const SadMxNx4Param x3d_avx2_tests[] = {
+  make_tuple(32, 64, &aom_sad32x64x3d_avx2, -1),
+  make_tuple(32, 32, &aom_sad32x32x3d_avx2, -1),
+  make_tuple(32, 16, &aom_sad32x16x3d_avx2, -1),
+  make_tuple(64, 128, &aom_sad64x128x3d_avx2, -1),
+  make_tuple(64, 64, &aom_sad64x64x3d_avx2, -1),
+  make_tuple(64, 32, &aom_sad64x32x3d_avx2, -1),
+  make_tuple(128, 128, &aom_sad128x128x3d_avx2, -1),
+  make_tuple(128, 64, &aom_sad128x64x3d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(32, 8, &aom_sad32x8x3d_avx2, -1),
+  make_tuple(64, 16, &aom_sad64x16x3d_avx2, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 8),
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 10),
+  make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 12),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 8),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 10),
+  make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 12),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 8),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 10),
+  make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx3Test, ::testing::ValuesIn(x3d_avx2_tests));
 #endif  // HAVE_AVX2
 
 }  // namespace