Downsample sad computation in hd motion search

This CL adds a speed feature to downsample the SAD computation during
motion search by skipping every other row in the SAD computation.

The downsampling is only performed when there are sufficiently many rows
to get a good estimation. Empirically, this happens when there are at
least 16 rows.

Currently this speed feature is only enabled on the hdres set.

Performance:
 SPD_SET | OVR_PSNR | AVG_PSNR |   SSIM  |  SPD
    0    |  +0.021% |  +0.025% | +0.084% | +2.3%
    1    |  +0.053% |  +0.071% | +0.136% | +3.8%
    2    |  +0.037% |  +0.055% | +0.091% | +4.3%
    3    |  +0.025% |  +0.038% | +0.074% | +2.6%
    4    |  +0.029% |  +0.043% | +0.093% | +3.0%
    5    |  +0.117% |  +0.128% | +0.194% | +3.3%
    6    |  +0.065% |  +0.082% | +0.179% | +3.2%

BUG=aomedia:2781

STATS_CHANGED

Change-Id: Ibf2afd9a7ffed939897249527b41bbaa4152a62c
(cherry picked from commit 0a32d3c251a0ae49b6e0a76249a699d33244e0be)
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 101204f..fdcf49f 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -31,6 +31,10 @@
                                    const uint8_t *ref_ptr, int ref_stride);
 typedef std::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 
+typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride);
+typedef std::tuple<int, int, SadSkipMxNFunc, int> SadSkipMxNParam;
+
 typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred);
@@ -60,6 +64,11 @@
                              uint32_t *sad_array);
 typedef std::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
 
+typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_ptr[], int ref_stride,
+                                 uint32_t *sad_array);
+typedef std::tuple<int, int, SadSkipMxNx4Func, int> SadSkipMxNx4Param;
+
 typedef void (*SadMxNx4AvgFunc)(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_ptr[], int ref_stride,
                                 const uint8_t *second_pred,
@@ -182,6 +191,31 @@
     return sad;
   }
 
+  // Sum of Absolute Differences Skip rows. Given two blocks,
+  // calculate the absolute difference between two pixels in the same
+  // relative location every other row; accumulate and double the result at the
+  // end.
+  unsigned int ReferenceSADSkip(int block_idx) {
+    unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    for (int h = 0; h < height_; h += 2) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+        } else {
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+        }
+      }
+    }
+    return sad * 2;
+  }
+
   // Sum of Absolute Differences Average. Given two blocks, and a prediction
   // calculate the absolute difference between one pixel and average of the
   // corresponding and predicted pixels; accumulate.
@@ -343,6 +377,50 @@
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
+};
+
+class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                      public SADTestBase {
+ public:
+  SADSkipx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSADSkip(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+
+  void SpeedSAD() {
+    int test_count = 2000000;
+    unsigned int exp_sad[4];
+    while (test_count > 0) {
+      SADs(exp_sad);
+      test_count -= 1;
+    }
+  }
 };
 
 class SADx4AvgTest : public ::testing::WithParamInterface<SadMxNx4AvgParam>,
@@ -412,6 +490,37 @@
   }
 };
 
+class SADSkipTest : public ::testing::WithParamInterface<SadMxNParam>,
+                    public SADTestBase {
+ public:
+  SADSkipTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADSkip(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  void SpeedSAD() {
+    int test_count = 20000000;
+    while (test_count > 0) {
+      SAD(0);
+      test_count -= 1;
+    }
+  }
+};
+
 class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
                    public SADTestBase {
  public:
@@ -608,6 +717,62 @@
 }
 #endif
 
+TEST_P(SADSkipTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 2000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    CheckSAD();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+#if SPEED_TEST
+TEST_P(SADSkipTest, Speed) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  SpeedSAD();
+  source_stride_ = tmp_stride;
+}
+#endif
+
 TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
@@ -855,6 +1020,101 @@
   source_data_ = tmp_source_data;
 }
 
+#if SPEED_TEST
+TEST_P(SADx4Test, Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+#endif
+
+// SADSkipx4
+TEST_P(SADSkipx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  int test_count = 1000;
+  while (test_count > 0) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(GetReference(0), reference_stride_);
+    FillRandom(GetReference(1), reference_stride_);
+    FillRandom(GetReference(2), reference_stride_);
+    FillRandom(GetReference(3), reference_stride_);
+    CheckSADs();
+    test_count -= 1;
+  }
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+#if SPEED_TEST
+TEST_P(SADSkipx4Test, Speed) {
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  SpeedSAD();
+}
+#endif
+
 using std::make_tuple;
 
 #if SPEED_TEST
@@ -1022,6 +1282,103 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
+const SadSkipMxNParam skip_c_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_c, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_c, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_c, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_c, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_c, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_c, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_c, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_c, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_c, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_c, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_c, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_c, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_c, -1),
+  make_tuple(8, 4, &aom_sad_skip_8x4_c, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_c, -1),
+  make_tuple(4, 4, &aom_sad_skip_4x4_c, -1),
+  make_tuple(64, 16, &aom_sad_skip_64x16_c, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_c, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_c, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_c, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4_c, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_c, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 8),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 10),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 12),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 12),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
+
 const SadMxNAvgParam avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
@@ -1281,6 +1638,101 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
+const SadMxNx4Param skip_x4d_c_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_c, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_c, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_c, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_c, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_c, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_c, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_c, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_c, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_c, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_c, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_c, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_c, -1),
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_c, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_c, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_c, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_c, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_c, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 8),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 8),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 8),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 8),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 10),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 10),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 10),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 10),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 12),
+  make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 12),
+  make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 12),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 12),
+  make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_c_tests));
+
 const SadMxNx4AvgParam x4d_avg_c_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1),
@@ -1424,6 +1876,83 @@
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
+const SadSkipMxNParam skip_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_sse2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_sse2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_sse2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_sse2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_sse2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_sse2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_sse2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_sse2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32_sse2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16_sse2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8_sse2, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16_sse2, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8_sse2, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8_sse2, -1),
+  make_tuple(64, 16, &aom_sad_skip_64x16_sse2, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64_sse2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8_sse2, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32_sse2, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16_sse2, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 8),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 8),
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 10),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 10),
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 12),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest,
+                         ::testing::ValuesIn(skip_sse2_tests));
+
 const SadMxNAvgParam avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
@@ -1606,6 +2135,84 @@
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_sse2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_sse2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_sse2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_sse2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_sse2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_sse2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_sse2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_sse2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_sse2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_sse2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_sse2, -1),
+  make_tuple(8, 16, &aom_sad_skip_8x16x4d_sse2, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_sse2, -1),
+  make_tuple(4, 8, &aom_sad_skip_4x8x4d_sse2, -1),
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_sse2, -1),
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_sse2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_sse2, -1),
+  make_tuple(8, 32, &aom_sad_skip_8x32x4d_sse2, -1),
+  make_tuple(4, 16, &aom_sad_skip_4x16x4d_sse2, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 8),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 8),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 8),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 8),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 8),
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 10),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 10),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 10),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 10),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 10),
+
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 12),
+  make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 12),
+  make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 12),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 12),
+  make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 12),
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_sse2_tests));
+
 const SadMxNx4AvgParam x4d_avg_sse2_tests[] = {
   make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1),
   make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1),
@@ -1802,6 +2409,59 @@
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
+const SadSkipMxNParam skip_avx2_tests[] = {
+  make_tuple(128, 128, &aom_sad_skip_128x128_avx2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64_avx2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128_avx2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64_avx2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32_avx2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64_avx2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32_avx2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 8),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 10),
+
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+                         ::testing::ValuesIn(skip_avx2_tests));
+
 const SadMxNAvgParam avg_avx2_tests[] = {
   make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
   make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
@@ -1862,6 +2522,67 @@
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {  // AVX2 skip-SAD x4d table
+  make_tuple(128, 128, &aom_sad_skip_128x128x4d_avx2, -1),
+  make_tuple(128, 64, &aom_sad_skip_128x64x4d_avx2, -1),
+  make_tuple(64, 128, &aom_sad_skip_64x128x4d_avx2, -1),
+  make_tuple(64, 64, &aom_sad_skip_64x64x4d_avx2, -1),
+  make_tuple(64, 32, &aom_sad_skip_64x32x4d_avx2, -1),
+  make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
+  make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
+  make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
+  make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+  make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH  // highbd kernels; last field presumably bit depth
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 8),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 8),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 8),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 8),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 8),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 8),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 8),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 8),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 8),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 8),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 8),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 8),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 8),
+  // Same highbd kernels, last field 10 (presumably 10-bit; TODO confirm).
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 10),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 10),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 10),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 10),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 10),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 10),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 10),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 10),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 10),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 10),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 10),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 10),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 10),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 10),
+  // Same highbd kernels, last field 12 (presumably 12-bit; TODO confirm).
+  make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 12),
+  make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 12),
+  make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 12),
+  make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 12),
+  make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 12),
+  make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 12),
+  make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 12),
+  make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 12),
+  make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 12),
+  make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 12),
+  make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 12),
+  make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 12),
+  make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 12),
+  make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 12),
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,  // one param per table entry
+                         ::testing::ValuesIn(skip_x4d_avx2_tests));
+
 const SadMxNx4Param x4d_avx2_tests[] = {
   make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),