// test/reconinter_test.cc - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <tuple>

 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"

 #include "aom_ports/mem.h"
 #include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

 namespace {
 using libaom_test::ACMRandom;

 // Signature shared by av1_build_compound_diffwtd_mask_d16_c and the optimized
 // variants that the tests below compare against it.
 using buildcompdiffwtdmaskd16_func = void (*)(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
     ConvolveParams *conv_params, int bd);

 // Test parameter: <bit depth, function under test, block size>.
 using BuildCompDiffwtdMaskD16Param =
     std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>;

 // Builds the parameter set for one implementation: bit depths 8, 10 and 12
 // crossed with |filter| and every block size in [BLOCK_4X4, BLOCK_SIZES_ALL).
 // The guard names every ISA whose instantiation below calls this helper, so
 // the definition is also visible when only AVX2 is enabled.
 #if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
 ::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
     buildcompdiffwtdmaskd16_func filter) {
   return ::testing::Combine(::testing::Range(8, 13, 2),
                             ::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 #endif
 // Fixture comparing an av1_build_compound_diffwtd_mask_d16 implementation,
 // supplied as test parameter 1, against the C reference.
 class BuildCompDiffwtdMaskD16Test
     : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
  public:
   ~BuildCompDiffwtdMaskD16Test() override {}
   // Re-seeds the generator so every test case draws the same input sequence.
   void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
   void TearDown() override { libaom_test::ClearSystemState(); }

  protected:
   // Checks |test_impl| against the C reference for every mask type.
   void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
   // Times |test_impl| against the C reference for |mask_type|.
   void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl,
                     DIFFWTD_MASK_TYPE mask_type);
   libaom_test::ACMRandom rnd_;
 };  // class BuildCompDiffwtdMaskD16Test
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);

 // Runs the C reference and |test_impl| on the same random inputs for every
 // mask type and requires both output masks to agree sample by sample.
 void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
     buildcompdiffwtdmaskd16_func test_impl) {
   const int bd = GET_PARAM(0);
   const int block_idx = GET_PARAM(2);
   const int height = block_size_high[block_idx];
   const int width = block_size_wide[block_idx];
   DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
   DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);

   ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);

   const int in_precision =
       bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
   assert(in_precision >= 0);  // Ensure left-shift is non-negative.
   assert(in_precision < 32);  // Ensure left-shift doesn't overflow.

   // Keep only the low |in_precision| bits of every random sample.
   const int sample_mask = (1 << in_precision) - 1;
   for (int idx = 0; idx < MAX_SB_SQUARE; ++idx) {
     src0[idx] = rnd_.Rand16() & sample_mask;
     src1[idx] = rnd_.Rand16() & sample_mask;
   }

   for (int type_idx = 0; type_idx < DIFFWTD_MASK_TYPES; ++type_idx) {
     const DIFFWTD_MASK_TYPE type = (DIFFWTD_MASK_TYPE)type_idx;

     av1_build_compound_diffwtd_mask_d16_c(mask_ref, type, src0, width, src1,
                                           width, height, width, &conv_params,
                                           bd);
     test_impl(mask_test, type, src0, width, src1, width, height, width,
               &conv_params, bd);

     for (int row = 0; row < height; ++row) {
       const int row_start = row * width;
       for (int col = 0; col < width; ++col) {
         const int pos = col + row_start;
         ASSERT_EQ(mask_ref[pos], mask_test[pos])
             << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
             << " Pixel mismatch at index " << "[" << row << "," << col << "] "
             << " @ " << width << "x" << height << " inv " << type_idx;
       }
     }
   }
 }

 // Times the same number of calls to the C reference and to |test_impl| for
 // one mask type, then prints reference time divided by |test_impl| time.
 void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
     buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) {
   const int bd = GET_PARAM(0);
   const int block_idx = GET_PARAM(2);
   const int height = block_size_high[block_idx];
   const int width = block_size_wide[block_idx];
   DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);

   ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);

   const int in_precision =
       bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
   assert(in_precision >= 0);  // Ensure left-shift is non-negative.
   assert(in_precision < 32);  // Ensure left-shift doesn't overflow.

   // Keep only the low |in_precision| bits of every random sample.
   const int sample_mask = (1 << in_precision) - 1;
   for (int idx = 0; idx < MAX_SB_SQUARE; ++idx) {
     src0[idx] = rnd_.Rand16() & sample_mask;
     src1[idx] = rnd_.Rand16() & sample_mask;
   }

   const int num_loops = 10000000 / (width + height);

   // Reference implementation.
   aom_usec_timer ref_timer;
   aom_usec_timer_start(&ref_timer);
   for (int iter = 0; iter < num_loops; ++iter) {
     av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
                                           width, height, width, &conv_params,
                                           bd);
   }
   aom_usec_timer_mark(&ref_timer);
   const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));

   // Implementation under test.
   aom_usec_timer test_timer;
   aom_usec_timer_start(&test_timer);
   for (int iter = 0; iter < num_loops; ++iter) {
     test_impl(mask, mask_type, src0, width, src1, width, height, width,
               &conv_params, bd);
   }
   aom_usec_timer_mark(&test_timer);
   const int test_usec = static_cast<int>(aom_usec_timer_elapsed(&test_timer));

   const double ratio = ref_usec / double(test_usec);
   printf("av1_build_compound_diffwtd_mask_d16  %3dx%-3d: %7.2f \n", width,
          height, ratio);
 }
 // Correctness: the parameterized implementation must match the C reference.
 TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
   const buildcompdiffwtdmaskd16_func impl = GET_PARAM(1);
   RunCheckOutput(impl);
 }

 // Timing comparison for both DIFFWTD_38 mask types.
 TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
   const buildcompdiffwtdmaskd16_func impl = GET_PARAM(1);
   RunSpeedTest(impl, DIFFWTD_38);
   RunSpeedTest(impl, DIFFWTD_38_INV);
 }

 // Instantiate the suite once per SIMD implementation enabled in the build;
 // each instance receives the parameter set produced by BuildParams().
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BuildCompDiffwtdMaskD16Test,
     BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
                          BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
 #endif

 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskD16Test,
                          BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
 #endif

 // Signature shared by refinemv_highbd_pad_mc_border_c and the optimized
 // variant it is compared against.
 using RefinemvPadMCBorderFunc = void (*)(const uint16_t *src, int src_stride,
                                          uint16_t *dst, int dst_stride, int x0,
                                          int y0, int b_w, int b_h,
                                          const ReferenceArea *ref_area);

 using std::get;
 using std::make_tuple;
 using std::tuple;

 // Number of timed calls per implementation in the speed test.
 static constexpr int kSpeedIterations = 10000;
 // Side length, in samples, of the square source buffer; also used as its
 // stride.
 static constexpr int kMaxDimension = 2 * (REF_BUFFER_WIDTH + 1);

 // <width, height, bit_depth, function under test>
 typedef tuple<int, int, int, RefinemvPadMCBorderFunc> Params;

 // Fixture that checks an optimized refinemv_highbd_pad_mc_border variant
 // against refinemv_highbd_pad_mc_border_c.
 // Parameters: <width, height, bit_depth, function under test>.
 class RefinemvPadMCBorderTest : public ::testing::TestWithParam<Params> {
  public:
   // Derives the block geometry from the test parameters, allocates the three
   // working buffers and fills the source with deterministic pseudo-random
   // samples masked to the requested bit depth.
   void SetUp() override {
     block_width_ = GET_PARAM(0) + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND;
     block_height_ = GET_PARAM(1) + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND;
     func_ = GET_PARAM(3);
     src_stride_ = kMaxDimension;
     dst_stride_ = REF_BUFFER_WIDTH;

     const size_t src_size = static_cast<size_t>(kMaxDimension) * kMaxDimension;
     const size_t max_width = REF_BUFFER_WIDTH + 1;
     const size_t max_block_size = max_width * max_width;
     src_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, src_size * sizeof(uint16_t)));
     ASSERT_NE(src_, nullptr);
     dst_ref_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, max_block_size * sizeof(uint16_t)));
     ASSERT_NE(dst_ref_, nullptr);
     dst_test_ = reinterpret_cast<uint16_t *>(
         aom_memalign(16, max_block_size * sizeof(uint16_t)));
     ASSERT_NE(dst_test_, nullptr);

     const aom_bit_depth_t bit_depth =
         static_cast<aom_bit_depth_t>(GET_PARAM(2));
     const int mask = (1 << bit_depth) - 1;
     ref_area_ = { { 0, 15, 0, 15 }, { 0 }, 0 };

     ACMRandom rnd;
     rnd.Reset(ACMRandom::DeterministicSeed());
     // Seed the whole allocation: input() points into the interior of src_,
     // so filling only a prefix would leave the samples the tests read
     // uninitialized.
     for (size_t j = 0; j < src_size; ++j) {
       src_[j] = rnd.Rand16() & mask;
     }
   }

   // Margins that centre a block_width_ x block_height_ block in the source.
   int BorderLeft() const { return (kMaxDimension - block_width_) / 2; }
   int BorderTop() const { return (kMaxDimension - block_height_) / 2; }

   // Pointer to the top-left sample of the centred block inside src_.
   uint16_t *input() const {
     const int offset = BorderTop() * kMaxDimension + BorderLeft();
     return src_ + offset;
   }

   void TearDown() override {
     aom_free(src_);
     aom_free(dst_ref_);
     aom_free(dst_test_);
   }

   // Asserts that two distinct buffers hold the same |width| x |height|
   // samples, both laid out with |stride|; reports the first differing sample.
   void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
                             int height, int stride) {
     ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
     for (int j = 0; j < height; ++j, p1 += stride, p2 += stride) {
       // Fast path: the whole row matches.
       if (memcmp(p1, p2, sizeof(*p1) * width) == 0) continue;
       for (int i = 0; i < width; ++i) {
         ASSERT_EQ(p1[i], p2[i])
             << width << "x" << height << " Pixel mismatch at (" << i << ", "
             << j << ")";
       }
     }
   }

  protected:
   void CheckResult();
   void RunSpeedTest();

  private:
   int block_height_ = 0;
   int block_width_ = 0;
   RefinemvPadMCBorderFunc func_ = nullptr;
   // Initialized to null so TearDown() never sees an indeterminate pointer if
   // SetUp() stops early.
   uint16_t *src_ = nullptr;
   uint16_t *dst_ref_ = nullptr;
   uint16_t *dst_test_ = nullptr;
   int src_stride_ = 0;
   int dst_stride_ = 0;
   ReferenceArea ref_area_;
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(RefinemvPadMCBorderTest);

 void RefinemvPadMCBorderTest::CheckResult() {
   uint16_t *const in = input();
   for (int y0 = -15; y0 <= 15; y0++) {
     for (int x0 = -15; x0 <= 15; x0++) {
       const uint16_t *const buf_ptr = in + y0 * src_stride_ + x0;
       refinemv_highbd_pad_mc_border_c(buf_ptr, src_stride_, dst_ref_,
                                       dst_stride_, x0, y0, block_width_,
                                       block_height_, &ref_area_);
       func_(buf_ptr, src_stride_, dst_test_, dst_stride_, x0, y0, block_width_,
             block_height_, &ref_area_);

       AssertOutputBufferEq(dst_ref_, dst_test_, block_width_, block_height_,
                            dst_stride_);
     }
   }
 }

 // Correctness: the function under test must match the C reference.
 TEST_P(RefinemvPadMCBorderTest, CheckResult) {
   CheckResult();
 }

 // Times kSpeedIterations calls of the C reference and of the function under
 // test for each combination of three x and three y offsets, printing the
 // per-pixel cost of both and their ratio.
 void RefinemvPadMCBorderTest::RunSpeedTest() {
   const int offsets[3] = { 0, 1, 15 };
   const int num_offsets = 3;
   uint16_t *const in = input();
   // Divisor that turns a total elapsed time into a per-pixel figure.
   const int pixel_calls = kSpeedIterations * block_width_ * block_height_;

   for (int yi = 0; yi < num_offsets; ++yi) {
     const int y0 = offsets[yi];
     for (int xi = 0; xi < num_offsets; ++xi) {
       const int x0 = offsets[xi];
       const uint16_t *const buf_ptr = in + y0 * src_stride_ + x0;

       aom_usec_timer timer;
       aom_usec_timer_start(&timer);
       for (int iter = 0; iter < kSpeedIterations; ++iter) {
         refinemv_highbd_pad_mc_border_c(buf_ptr, src_stride_, dst_ref_,
                                         dst_stride_, x0, y0, block_width_,
                                         block_height_, &ref_area_);
       }
       aom_usec_timer_mark(&timer);
       const auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

       aom_usec_timer_start(&timer);
       for (int iter = 0; iter < kSpeedIterations; ++iter) {
         func_(buf_ptr, src_stride_, dst_test_, dst_stride_, x0, y0,
               block_width_, block_height_, &ref_area_);
       }
       aom_usec_timer_mark(&timer);
       const auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

       const float c_time_per_pixel =
           (float)1000.0 * elapsed_time_c / pixel_calls;
       const float opt_time_per_pixel =
           (float)1000.0 * elapsed_time_opt / pixel_calls;
       const float scaling = c_time_per_pixel / opt_time_per_pixel;
       printf(
           "%3dx%-3d: c_time_per_pixel=%10.5f, "
           "opt_time_per_pixel=%10.5f,  scaling=%f, x0:%d, y0:%d \n",
           block_width_, block_height_, c_time_per_pixel, opt_time_per_pixel,
           scaling, x0, y0);
     }
   }
 }

 // Timing comparison between the C reference and the function under test.
 TEST_P(RefinemvPadMCBorderTest, DISABLED_Speed) {
   RunSpeedTest();
 }

 #if HAVE_AVX2
 // <width, height, bit_depth, function> cases for the AVX2 implementation:
 // width 8 with heights 4, 8 and 16, each at bit depths 8, 10 and 12.
 const Params kRefinemvPadMCBorder_avx2[] = {
   make_tuple(8, 4, 8, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 8, 8, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 16, 8, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 4, 10, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 8, 10, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 16, 10, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 4, 12, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 8, 12, &refinemv_highbd_pad_mc_border_avx2),
   make_tuple(8, 16, 12, &refinemv_highbd_pad_mc_border_avx2)
 };

 INSTANTIATE_TEST_SUITE_P(AVX2, RefinemvPadMCBorderTest,
                          ::testing::ValuesIn(kRefinemvPadMCBorder_avx2));
 #endif  // HAVE_AVX2

 }  // namespace
	/*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <tuple>

	#include "config/aom_config.h"
	#include "config/av1_rtcd.h"

	#include "aom_ports/mem.h"
	#include "av1/common/scan.h"
	#include "av1/common/txb_common.h"
	#include "test/acm_random.h"
	#include "test/clear_system_state.h"
	#include "test/register_state_check.h"
	#include "test/util.h"
	#include "third_party/googletest/src/googletest/include/gtest/gtest.h"

	namespace {
	using libaom_test::ACMRandom;

	// Signature shared by av1_build_compound_diffwtd_mask_d16_c and the
	// optimized variants that the tests below compare against it. |mask| and
	// |src0| are pointers: the callers below pass arrays for both.
	typedef void (*buildcompdiffwtdmaskd16_func)(
	    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
	    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
	    ConvolveParams *conv_params, int bd);

	// Test parameter: <bit depth, function under test, block size>.
	typedef std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
	    BuildCompDiffwtdMaskD16Param;

	// Builds the parameter set for one implementation: bit depths 8, 10 and 12
	// crossed with |filter| and every block size in
	// [BLOCK_4X4, BLOCK_SIZES_ALL). The guard names every ISA whose
	// instantiation below calls this helper.
	#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
	::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
	    buildcompdiffwtdmaskd16_func filter) {
	  return ::testing::Combine(::testing::Range(8, 13, 2),
	                            ::testing::Values(filter),
	                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
	}
	#endif
	// Fixture comparing an av1_build_compound_diffwtd_mask_d16 implementation,
	// supplied as test parameter 1, against the C reference.
	class BuildCompDiffwtdMaskD16Test
	    : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
	 public:
	  ~BuildCompDiffwtdMaskD16Test() override {}
	  // Re-seeds the generator so every test case draws the same input sequence.
	  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
	  void TearDown() override { libaom_test::ClearSystemState(); }

	 protected:
	  // Checks |test_impl| against the C reference for every mask type.
	  void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
	  // Times |test_impl| against the C reference for |mask_type|.
	  void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl,
	                    DIFFWTD_MASK_TYPE mask_type);
	  libaom_test::ACMRandom rnd_;
	};  // class BuildCompDiffwtdMaskD16Test
	GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);

	// Runs the C reference and |test_impl| on the same random inputs for every
	// mask type and requires both output masks to agree sample by sample.
	void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
	    buildcompdiffwtdmaskd16_func test_impl) {
	  const int bd = GET_PARAM(0);
	  const int block_idx = GET_PARAM(2);
	  const int height = block_size_high[block_idx];
	  const int width = block_size_wide[block_idx];
	  DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
	  DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
	  DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
	  DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);

	  ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);

	  const int in_precision =
	      bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
	  assert(in_precision >= 0);  // Ensure left-shift is non-negative.
	  assert(in_precision < 32);  // Ensure left-shift doesn't overflow.

	  // Keep only the low |in_precision| bits of every random sample.
	  const int sample_mask = (1 << in_precision) - 1;
	  for (int idx = 0; idx < MAX_SB_SQUARE; ++idx) {
	    src0[idx] = rnd_.Rand16() & sample_mask;
	    src1[idx] = rnd_.Rand16() & sample_mask;
	  }

	  for (int type_idx = 0; type_idx < DIFFWTD_MASK_TYPES; ++type_idx) {
	    const DIFFWTD_MASK_TYPE type = (DIFFWTD_MASK_TYPE)type_idx;

	    av1_build_compound_diffwtd_mask_d16_c(mask_ref, type, src0, width, src1,
	                                          width, height, width, &conv_params,
	                                          bd);
	    test_impl(mask_test, type, src0, width, src1, width, height, width,
	              &conv_params, bd);

	    for (int row = 0; row < height; ++row) {
	      const int row_start = row * width;
	      for (int col = 0; col < width; ++col) {
	        const int pos = col + row_start;
	        ASSERT_EQ(mask_ref[pos], mask_test[pos])
	            << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
	            << " Pixel mismatch at index " << "[" << row << "," << col << "] "
	            << " @ " << width << "x" << height << " inv " << type_idx;
	      }
	    }
	  }
	}

	// Times the same number of calls to the C reference and to |test_impl| for
	// one mask type, then prints reference time divided by |test_impl| time.
	void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
	    buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) {
	  const int bd = GET_PARAM(0);
	  const int block_idx = GET_PARAM(2);
	  const int height = block_size_high[block_idx];
	  const int width = block_size_wide[block_idx];
	  DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
	  DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
	  DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);

	  ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);

	  const int in_precision =
	      bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
	  assert(in_precision >= 0);  // Ensure left-shift is non-negative.
	  assert(in_precision < 32);  // Ensure left-shift doesn't overflow.

	  // Keep only the low |in_precision| bits of every random sample.
	  const int sample_mask = (1 << in_precision) - 1;
	  for (int idx = 0; idx < MAX_SB_SQUARE; ++idx) {
	    src0[idx] = rnd_.Rand16() & sample_mask;
	    src1[idx] = rnd_.Rand16() & sample_mask;
	  }

	  const int num_loops = 10000000 / (width + height);

	  // Reference implementation.
	  aom_usec_timer ref_timer;
	  aom_usec_timer_start(&ref_timer);
	  for (int iter = 0; iter < num_loops; ++iter) {
	    av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
	                                          width, height, width, &conv_params,
	                                          bd);
	  }
	  aom_usec_timer_mark(&ref_timer);
	  const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));

	  // Implementation under test.
	  aom_usec_timer test_timer;
	  aom_usec_timer_start(&test_timer);
	  for (int iter = 0; iter < num_loops; ++iter) {
	    test_impl(mask, mask_type, src0, width, src1, width, height, width,
	              &conv_params, bd);
	  }
	  aom_usec_timer_mark(&test_timer);
	  const int test_usec = static_cast<int>(aom_usec_timer_elapsed(&test_timer));

	  const double ratio = ref_usec / double(test_usec);
	  printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f \n", width,
	         height, ratio);
	}
	// Correctness: the parameterized implementation must match the C reference.
	TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
	  const buildcompdiffwtdmaskd16_func impl = GET_PARAM(1);
	  RunCheckOutput(impl);
	}

	// Timing comparison for both DIFFWTD_38 mask types.
	TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
	  const buildcompdiffwtdmaskd16_func impl = GET_PARAM(1);
	  RunSpeedTest(impl, DIFFWTD_38);
	  RunSpeedTest(impl, DIFFWTD_38_INV);
	}

	// Instantiate the suite once per SIMD implementation enabled in the build;
	// each instance receives the parameter set produced by BuildParams().
	#if HAVE_SSE4_1
	INSTANTIATE_TEST_SUITE_P(
	SSE4_1, BuildCompDiffwtdMaskD16Test,
	BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
	#endif

	#if HAVE_AVX2
	INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
	BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
	#endif

	#if HAVE_NEON
	INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskD16Test,
	BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
	#endif

	// Pointer-to-function type shared by refinemv_highbd_pad_mc_border_c and the
	// optimized variant it is compared against. It must be a pointer type: it is
	// stored in a tuple and in a data member, and |src| is indexed as a buffer.
	typedef void (*RefinemvPadMCBorderFunc)(const uint16_t *src, int src_stride,
	                                        uint16_t *dst, int dst_stride, int x0,
	                                        int y0, int b_w, int b_h,
	                                        const ReferenceArea *ref_area);

	using std::get;
	using std::make_tuple;
	using std::tuple;

	// Number of timed calls per implementation in the speed test.
	static constexpr int kSpeedIterations = 10000;
	// Side length, in samples, of the square source buffer; also used as its
	// stride.
	static constexpr int kMaxDimension = 2 * (REF_BUFFER_WIDTH + 1);

	// <width, height, bit_depth, function under test>
	typedef tuple<int, int, int, RefinemvPadMCBorderFunc> Params;

	// Fixture that checks an optimized refinemv_highbd_pad_mc_border variant
	// against refinemv_highbd_pad_mc_border_c.
	// Parameters: <width, height, bit_depth, function under test>.
	class RefinemvPadMCBorderTest : public ::testing::TestWithParam<Params> {
	 public:
	  // Derives the block geometry from the test parameters, allocates the three
	  // working buffers and fills the source with deterministic pseudo-random
	  // samples masked to the requested bit depth.
	  void SetUp() override {
	    block_width_ = GET_PARAM(0) + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND;
	    block_height_ = GET_PARAM(1) + (AOM_INTERP_EXTEND - 1) + AOM_INTERP_EXTEND;
	    func_ = GET_PARAM(3);
	    src_stride_ = kMaxDimension;
	    dst_stride_ = REF_BUFFER_WIDTH;

	    const size_t src_size = static_cast<size_t>(kMaxDimension) * kMaxDimension;
	    const size_t max_width = REF_BUFFER_WIDTH + 1;
	    const size_t max_block_size = max_width * max_width;
	    src_ = reinterpret_cast<uint16_t *>(
	        aom_memalign(16, src_size * sizeof(uint16_t)));
	    ASSERT_NE(src_, nullptr);
	    dst_ref_ = reinterpret_cast<uint16_t *>(
	        aom_memalign(16, max_block_size * sizeof(uint16_t)));
	    ASSERT_NE(dst_ref_, nullptr);
	    dst_test_ = reinterpret_cast<uint16_t *>(
	        aom_memalign(16, max_block_size * sizeof(uint16_t)));
	    ASSERT_NE(dst_test_, nullptr);

	    const aom_bit_depth_t bit_depth =
	        static_cast<aom_bit_depth_t>(GET_PARAM(2));
	    const int mask = (1 << bit_depth) - 1;
	    ref_area_ = { { 0, 15, 0, 15 }, { 0 }, 0 };

	    ACMRandom rnd;
	    rnd.Reset(ACMRandom::DeterministicSeed());
	    // Seed the whole allocation: input() points into the interior of src_,
	    // so filling only a prefix would leave the samples the tests read
	    // uninitialized.
	    for (size_t j = 0; j < src_size; ++j) {
	      src_[j] = rnd.Rand16() & mask;
	    }
	  }

	  // Margins that centre a block_width_ x block_height_ block in the source.
	  int BorderLeft() const { return (kMaxDimension - block_width_) / 2; }
	  int BorderTop() const { return (kMaxDimension - block_height_) / 2; }

	  // Pointer to the top-left sample of the centred block inside src_.
	  uint16_t *input() const {
	    const int offset = BorderTop() * kMaxDimension + BorderLeft();
	    return src_ + offset;
	  }

	  void TearDown() override {
	    aom_free(src_);
	    aom_free(dst_ref_);
	    aom_free(dst_test_);
	  }

	  // Asserts that two distinct buffers hold the same |width| x |height|
	  // samples, both laid out with |stride|; reports the first differing sample.
	  void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
	                            int height, int stride) {
	    ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
	    for (int j = 0; j < height; ++j, p1 += stride, p2 += stride) {
	      // Fast path: the whole row matches.
	      if (memcmp(p1, p2, sizeof(*p1) * width) == 0) continue;
	      for (int i = 0; i < width; ++i) {
	        ASSERT_EQ(p1[i], p2[i])
	            << width << "x" << height << " Pixel mismatch at (" << i << ", "
	            << j << ")";
	      }
	    }
	  }

	 protected:
	  void CheckResult();
	  void RunSpeedTest();

	 private:
	  int block_height_ = 0;
	  int block_width_ = 0;
	  RefinemvPadMCBorderFunc func_ = nullptr;
	  // Initialized to null so TearDown() never sees an indeterminate pointer if
	  // SetUp() stops early.
	  uint16_t *src_ = nullptr;
	  uint16_t *dst_ref_ = nullptr;
	  uint16_t *dst_test_ = nullptr;
	  int src_stride_ = 0;
	  int dst_stride_ = 0;
	  ReferenceArea ref_area_;
	};
	GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(RefinemvPadMCBorderTest);

	void RefinemvPadMCBorderTest::CheckResult() {
	uint16_t *const in = input();
	for (int y0 = -15; y0 <= 15; y0++) {
	for (int x0 = -15; x0 <= 15; x0++) {
	const uint16_t const buf_ptr = in + y0 src_stride_ + x0;
	refinemv_highbd_pad_mc_border_c(buf_ptr, src_stride_, dst_ref_,
	dst_stride_, x0, y0, block_width_,
	block_height_, &ref_area_);
	func_(buf_ptr, src_stride_, dst_test_, dst_stride_, x0, y0, block_width_,
	block_height_, &ref_area_);

	AssertOutputBufferEq(dst_ref_, dst_test_, block_width_, block_height_,
	dst_stride_);
	}
	}
	}

	// Correctness: the function under test must match the C reference.
	TEST_P(RefinemvPadMCBorderTest, CheckResult) {
	  CheckResult();
	}

	// Times kSpeedIterations calls of the C reference and of the function under
	// test for each combination of three x and three y offsets, printing the
	// per-pixel cost of both and their ratio.
	void RefinemvPadMCBorderTest::RunSpeedTest() {
	  int x0[3] = { 0, 1, 15 };
	  int y0[3] = { 0, 1, 15 };
	  uint16_t *const in = input();

	  for (int k = 0; k <= 2; k++) {
	    for (int l = 0; l <= 2; l++) {
	      // Step y0[k] rows and x0[l] columns away from the centred block.
	      const uint16_t *const buf_ptr = in + y0[k] * src_stride_ + x0[l];
	      aom_usec_timer timer;
	      aom_usec_timer_start(&timer);
	      for (int i = 0; i < kSpeedIterations; ++i) {
	        refinemv_highbd_pad_mc_border_c(buf_ptr, src_stride_, dst_ref_,
	                                        dst_stride_, x0[l], y0[k], block_width_,
	                                        block_height_, &ref_area_);
	      }
	      aom_usec_timer_mark(&timer);
	      auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

	      aom_usec_timer_start(&timer);
	      for (int i = 0; i < kSpeedIterations; ++i) {
	        func_(buf_ptr, src_stride_, dst_test_, dst_stride_, x0[l], y0[k],
	              block_width_, block_height_, &ref_area_);
	      }
	      aom_usec_timer_mark(&timer);
	      auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

	      float c_time_per_pixel =
	          (float)1000.0 * elapsed_time_c /
	          (kSpeedIterations * block_width_ * block_height_);
	      float opt_time_per_pixel =
	          (float)1000.0 * elapsed_time_opt /
	          (kSpeedIterations * block_width_ * block_height_);
	      float scaling = c_time_per_pixel / opt_time_per_pixel;
	      printf(
	          "%3dx%-3d: c_time_per_pixel=%10.5f, "
	          "opt_time_per_pixel=%10.5f, scaling=%f, x0:%d, y0:%d \n",
	          block_width_, block_height_, c_time_per_pixel, opt_time_per_pixel,
	          scaling, x0[l], y0[k]);
	    }
	  }
	}

	// Timing comparison between the C reference and the function under test.
	TEST_P(RefinemvPadMCBorderTest, DISABLED_Speed) {
	  RunSpeedTest();
	}

	#if HAVE_AVX2
	// <width, height, bit_depth, function> cases for the AVX2 implementation:
	// width 8 with heights 4, 8 and 16, each at bit depths 8, 10 and 12.
	const Params kRefinemvPadMCBorder_avx2[] = {
	make_tuple(8, 4, 8, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 8, 8, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 16, 8, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 4, 10, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 8, 10, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 16, 10, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 4, 12, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 8, 12, &refinemv_highbd_pad_mc_border_avx2),
	make_tuple(8, 16, 12, &refinemv_highbd_pad_mc_border_avx2)
	};

	INSTANTIATE_TEST_SUITE_P(AVX2, RefinemvPadMCBorderTest,
	::testing::ValuesIn(kRefinemvPadMCBorder_avx2));
	#endif // HAVE_AVX2

	} // namespace