test/opt_flow_test.cc - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <set>
 #include <vector>
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

 #include "aom_ports/aom_timer.h"
 #include "av1/common/reconinter.h"
 #include "av1/common/mvref_common.h"

 #if CONFIG_OPTFLOW_REFINEMENT
 namespace {

 // Width and height of a block under test, together with the optical flow
 // sub-block size selected for those dimensions.
 class BlockSize {
  public:
   // Blocks that are at most 8 wide and at most 8 high use OF_MIN_BSIZE as the
   // sub-block size; every other block uses OF_BSIZE.
   BlockSize(int w, int h)
       : width_(w),
         height_(h),
         n_((w > 8 || h > 8) ? OF_BSIZE : OF_MIN_BSIZE) {}

   int Width() const { return width_; }
   int Height() const { return height_; }
   int OptFlowBlkSize() const { return n_; }

   // Orders by width first and by height second, so that BlockSize can be used
   // as the key of an ordered container.
   bool operator<(const BlockSize &other) const {
     if (width_ != other.width_) return width_ < other.width_;
     return height_ < other.height_;
   }

   // Two sizes are equal when both dimensions match; the sub-block size is
   // derived from the dimensions and is not compared separately.
   bool operator==(const BlockSize &other) const {
     return !(width_ != other.width_ || height_ != other.height_);
   }

  private:
   int width_;
   int height_;
   int n_;
 };

 // Block size / bit depth / test function used to parameterize the tests.
 template <typename T>
 class TestParam {
  public:
   TestParam(const BlockSize &block, int bd, T test_func)
       : block_(block), bd_(bd), test_func_(test_func) {}

   const BlockSize &Block() const { return block_; }
   int BitDepth() const { return bd_; }
   T TestFunction() const { return test_func_; }

   bool operator==(const TestParam &other) const {
     return Block() == other.Block() && BitDepth() == other.BitDepth() &&
            TestFunction() == other.TestFunction();
   }

  private:
   BlockSize block_;
   int bd_;
   T test_func_;
 };

 template <typename T>
 std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) {
   return os << "TestParam { width:" << test_arg.Block().Width()
             << " height:" << test_arg.Block().Height()
             << " bd:" << test_arg.BitDepth() << " }";
 }

 // Shared fixture for the optical flow unit tests. It owns the random number
 // generator and offers helpers that fill sample buffers with random or
 // extreme values and that compare reference output against test output.
 template <typename T>
 class AV1OptFlowTest : public ::testing::TestWithParam<TestParam<T>> {
  public:
   virtual ~AV1OptFlowTest() { TearDown(); }

   // Resets the generator with ACMRandom's deterministic seed.
   void SetUp() override {
     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   }

   void TearDown() override { libaom_test::ClearSystemState(); }

   // Asserts that the first |n| int entries of |ref| and |test| are equal.
   // The two pointers must refer to different buffers.
   void AssertOutputEq(const int *ref, const int *test, int n) {
     ASSERT_TRUE(ref != test) << "Buffers must be at different memory locations";
     int idx = 0;
     while (idx < n) {
       ASSERT_EQ(ref[idx], test[idx]) << "Mismatch at index " << idx;
       ++idx;
     }
   }

   // Asserts that two int16_t buffers agree on a |width| x |height| region
   // whose rows are |stride| elements apart. The two pointers must refer to
   // different buffers.
   void AssertOutputBufferEq(const int16_t *ref, const int16_t *test, int width,
                             int height, int stride) {
     ASSERT_TRUE(ref != test) << "Buffers must be in different memory locations";
     for (int row = 0; row != height && row < height; ++row) {
       for (int col = 0; col != width && col < width; ++col) {
         ASSERT_EQ(ref[row * stride + col], test[row * stride + col])
             << width << "x" << height << " Pixel mismatch at (" << col << ", "
             << row << ")";
       }
     }
   }

   // Returns a random value restricted to its low |max_bit_range| bits.
   uint8_t RandomFrameIdx(int max_bit_range) {
     const int mask = (1 << max_bit_range) - 1;
     return static_cast<uint8_t>(rnd_.Rand8() & mask);
   }

   // Returns, at random, either -(1 << (max_bit_range - 1)) or
   // (1 << (max_bit_range - 1)) - 1.
   int8_t RelativeDistExtreme(int max_bit_range) {
     return Rand8SignedExtremes(max_bit_range);
   }

   // Fills a block of width * height samples with random 8-bit values.
   // Expects an 8-bit test parameter.
   void RandomInput8(uint8_t *p, const TestParam<T> &param) {
     EXPECT_EQ(8, param.BitDepth());
     ExpectFitsInSuperblock(param);
     Randomize(p, SampleCount(param));
   }

   // Fills |size| entries with values drawn from ACMRandom::Rand9Signed().
   void Randomize9Signed(int16_t *p, int size) {
     int remaining = size;
     for (int16_t *dst = p; remaining > 0; --remaining, ++dst) {
       *dst = rnd_.Rand9Signed();
     }
   }

   // Fills a block of width * height samples via Randomize9Signed().
   void RandomInput9(int16_t *p, const TestParam<T> &param) {
     ExpectFitsInSuperblock(param);
     Randomize9Signed(p, SampleCount(param));
   }

   // Fills a block with random unsigned values limited to |max_bit_range|
   // bits. Expects a bit depth of at most 12.
   void RandomInput16(uint16_t *p, const TestParam<T> &param,
                      int max_bit_range) {
     EXPECT_GE(12, param.BitDepth());
     ExpectFitsInSuperblock(param);
     Randomize(p, SampleCount(param), max_bit_range);
   }

   // Fills a block with random signed values spanning |max_bit_range| bits,
   // centred on zero.
   void RandomInput16(int16_t *p, const TestParam<T> &param, int max_bit_range) {
     ExpectFitsInSuperblock(param);
     Randomize(p, SampleCount(param), max_bit_range);
   }

   // Fills a block with samples that are each either 0 or 255. Expects an
   // 8-bit test parameter.
   void RandomInput8Extreme(uint8_t *p, const TestParam<T> &param) {
     EXPECT_EQ(8, param.BitDepth());
     ExpectFitsInSuperblock(param);
     RandomizeExtreme(p, SampleCount(param));
   }

   // Fills a block with samples that are each the minimum or the maximum of a
   // signed |max_bit_range|-bit value. Expects a bit depth of at most 12.
   void RandomInput9Extreme(int16_t *p, const TestParam<T> &param,
                            int max_bit_range) {
     EXPECT_GE(12, param.BitDepth());
     ExpectFitsInSuperblock(param);
     Randomize9Extreme(p, SampleCount(param), max_bit_range);
   }

   // Fills a block with samples that are each either 0 or
   // (1 << max_bit_range) - 1. Expects a bit depth of at most 12.
   void RandomInput16Extreme(uint16_t *p, const TestParam<T> &param,
                             int max_bit_range) {
     EXPECT_GE(12, param.BitDepth());
     ExpectFitsInSuperblock(param);
     RandomizeExtreme(p, SampleCount(param), max_bit_range);
   }

   // Fills a block with samples that are each the minimum or the maximum of a
   // signed |max_bit_range|-bit value. Expects a bit depth of at most 12.
   void RandomInput16Extreme(int16_t *p, const TestParam<T> &param,
                             int max_bit_range) {
     EXPECT_GE(12, param.BitDepth());
     ExpectFitsInSuperblock(param);
     RandomizeExtreme(p, SampleCount(param), max_bit_range);
   }

  private:
   // Expects the block in |param| to be no wider and no taller than
   // MAX_SB_SIZE.
   static void ExpectFitsInSuperblock(const TestParam<T> &param) {
     EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
     EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
   }

   // Number of samples (width * height) in the block described by |param|.
   static int SampleCount(const TestParam<T> &param) {
     const BlockSize &blk = param.Block();
     return blk.Width() * blk.Height();
   }

   void Randomize(uint8_t *p, int size) {
     for (int k = 0; k < size; ++k) {
       p[k] = rnd_.Rand8();
     }
   }

   void Randomize(uint16_t *p, int size, int max_bit_range) {
     assert(max_bit_range <= 16);
     const int mask = (1 << max_bit_range) - 1;
     for (int k = 0; k < size; ++k) {
       p[k] = rnd_.Rand16() & mask;
     }
   }

   // Masks each random value to |max_bit_range| bits and then subtracts half
   // of the range so that the result is centred on zero.
   void Randomize(int16_t *p, int size, int max_bit_range) {
     assert(max_bit_range <= 16);
     const int mask = (1 << max_bit_range) - 1;
     const int half_range = 1 << (max_bit_range - 1);
     for (int k = 0; k < size; ++k) {
       p[k] = (rnd_.Rand16() & mask) - half_range;
     }
   }

   // Returns bit 7 of a fresh Rand8() value.
   int RandBool() {
     // There's a bit more entropy in the upper bits of this implementation.
     return (rnd_.Rand8() >> 7) & 0x1;
   }

   uint8_t Rand8Extremes() {
     if (RandBool()) return 255;
     return 0;
   }

   int8_t Rand8SignedExtremes(int max_bit_range) {
     const int mask = (1 << max_bit_range) - 1;
     const int half_range = 1 << (max_bit_range - 1);
     const uint8_t unsigned_extreme = Rand8Extremes() & mask;
     return static_cast<int8_t>(unsigned_extreme - half_range);
   }

   // Same computation as Rand16SignedExtremes(); kept as a separate name for
   // the 9-bit helpers.
   int16_t Rand9SignedExtremes(int max_bit_range) {
     return Rand16SignedExtremes(max_bit_range);
   }

   uint16_t Rand16Extremes(int max_bit_range) {
     const int top = (1 << max_bit_range) - 1;
     return static_cast<uint16_t>(RandBool() ? top : 0);
   }

   int16_t Rand16SignedExtremes(int max_bit_range) {
     const int half_range = 1 << (max_bit_range - 1);
     const uint16_t unsigned_extreme = Rand16Extremes(max_bit_range);
     return static_cast<int16_t>(unsigned_extreme - half_range);
   }

   void RandomizeExtreme(uint8_t *p, int size) {
     for (int k = 0; k < size; ++k) {
       p[k] = Rand8Extremes();
     }
   }

   void RandomizeExtreme(uint16_t *p, int size, int max_bit_range) {
     for (int k = 0; k < size; ++k) {
       p[k] = Rand16Extremes(max_bit_range);
     }
   }

   void RandomizeExtreme(int16_t *p, int size, int max_bit_range) {
     for (int k = 0; k < size; ++k) {
       p[k] = Rand16SignedExtremes(max_bit_range);
     }
   }

   void Randomize9Extreme(int16_t *p, int size, int max_bit_range) {
     for (int k = 0; k < size; ++k) {
       p[k] = Rand9SignedExtremes(max_bit_range);
     }
   }

   libaom_test::ACMRandom rnd_;
 };

 // Builds one TestParam per (bit depth, block size) combination. Block sizes
 // come from block_size_wide/block_size_high for every index from BLOCK_8X8 up
 // to, but not including, BLOCK_SIZES_ALL; only those with both dimensions of
 // at least 8 are kept. Sizes are gathered in a std::set, so repeated
 // dimensions appear once and the result is ordered by BlockSize::operator<
 // within each bit depth.
 template <typename T>
 std::vector<TestParam<T>> GetOptFlowTestParams(
     std::initializer_list<int> bit_depths, T test_func) {
   std::set<BlockSize> block_sizes;
   for (int bs = BLOCK_8X8; bs < BLOCK_SIZES_ALL; ++bs) {
     const int wide = block_size_wide[bs];
     const int high = block_size_high[bs];
     if (wide >= 8 && high >= 8) {
       block_sizes.insert(BlockSize(wide, high));
     }
   }

   std::vector<TestParam<T>> params;
   for (const int bd : bit_depths) {
     for (const BlockSize &blk : block_sizes) {
       params.emplace_back(blk, bd, test_func);
     }
   }
   return params;
 }

 // Convenience wrapper around GetOptFlowTestParams() for bit depths 8, 10
 // and 12.
 template <typename T>
 std::vector<TestParam<T>> GetOptFlowHighbdTestParams(T test_func) {
   const std::initializer_list<int> highbd_bit_depths = { 8, 10, 12 };
   return GetOptFlowTestParams<T>(highbd_bit_depths, test_func);
 }

 // Wraps the parameter list from GetOptFlowHighbdTestParams() in a googletest
 // parameter generator suitable for INSTANTIATE_TEST_SUITE_P.
 template <typename T>
 ::testing::internal::ParamGenerator<TestParam<T>> BuildOptFlowHighbdParams(
     T test_func) {
   const std::vector<TestParam<T>> params =
       GetOptFlowHighbdTestParams(test_func);
   return ::testing::ValuesIn(params);
 }

 // Signature shared by the high bit depth optical flow MV refinement
 // functions exercised by AV1OptFlowRefineHighbdTest.
 using opfl_mv_refinement_highbd = int (*)(
     const uint16_t *p0, int pstride0, const uint16_t *p1, int pstride1,
     const int16_t *gx0, const int16_t *gy0, const int16_t *gx1,
     const int16_t *gy1, int gstride, int bw, int bh, int n, int d0, int d1,
     int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0, int *vx1,
     int *vy1);

 // Checks an optical flow MV refinement implementation with the
 // opfl_mv_refinement_highbd signature against
 // av1_opfl_mv_refinement_nxn_highbd_c, and can time the two against each
 // other.
 class AV1OptFlowRefineHighbdTest
     : public AV1OptFlowTest<opfl_mv_refinement_highbd> {
  public:
   // Allocates 16-byte aligned buffers of width * height samples for the two
   // inputs and their x/y gradients.
   AV1OptFlowRefineHighbdTest() {
     const BlockSize &block = GetParam().Block();
     const int num_samples = block.Width() * block.Height();

     input0_ = static_cast<uint16_t *>(
         aom_memalign(16, num_samples * sizeof(*input0_)));
     input1_ = static_cast<uint16_t *>(
         aom_memalign(16, num_samples * sizeof(*input1_)));
     gx0_ =
         static_cast<int16_t *>(aom_memalign(16, num_samples * sizeof(*gx0_)));
     gy0_ =
         static_cast<int16_t *>(aom_memalign(16, num_samples * sizeof(*gy0_)));
     gx1_ =
         static_cast<int16_t *>(aom_memalign(16, num_samples * sizeof(*gx1_)));
     gy1_ =
         static_cast<int16_t *>(aom_memalign(16, num_samples * sizeof(*gy1_)));
   }

   ~AV1OptFlowRefineHighbdTest() {
     aom_free(input0_);
     aom_free(input1_);
     aom_free(gx0_);
     aom_free(gy0_);
     aom_free(gx1_);
     aom_free(gy1_);
   }

   // Runs the comparison (is_speed == 0) or the timing loop (is_speed != 0).
   // In comparison mode a random-input pass over every order hint width from 1
   // to kMaxOrderHintBits is followed by an extreme-value pass.
   void RunTest(const int is_speed) {
     // The constructor cannot report a failed allocation, so check here before
     // any buffer is written.
     ASSERT_TRUE(input0_ != nullptr && input1_ != nullptr)
         << "Input buffer allocation failed";
     ASSERT_TRUE(gx0_ != nullptr && gy0_ != nullptr && gx1_ != nullptr &&
                 gy1_ != nullptr)
         << "Gradient buffer allocation failed";

     // Only the two fields assigned below are set before oh_info is handed to
     // get_relative_dist().
     OrderHintInfo oh_info;
     const BlockSize &block = GetParam().Block();
     const int bd = GetParam().BitDepth();
     // Block dimensions in units of (1 << MI_SIZE_LOG2) samples; larger blocks
     // get proportionally fewer iterations.
     const int bw_mi = block.Width() >> MI_SIZE_LOG2;
     const int bh_mi = block.Height() >> MI_SIZE_LOG2;
     const int numIter = is_speed ? 1 : 16384 / (bw_mi * bh_mi);
     const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1;

     oh_info.enable_order_hint = 1;
     for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) {
       // |count| only advances for usable draws, so numIter comparisons are
       // always performed.
       for (int count = 0; count < numIter;) {
         const int cur_frm_idx = RandomFrameIdx(oh_bits);
         const int ref0_frm_idx = RandomFrameIdx(oh_bits);
         const int ref1_frm_idx = RandomFrameIdx(oh_bits);

         oh_info.order_hint_bits_minus_1 = oh_bits - 1;
         const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx);
         const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx);
         // Draws with a zero distance are discarded and retried.
         if (!d0 || !d1) continue;

         // Inputs use |bd| bits; gradients are signed and use one bit more.
         RandomInput16(input0_, GetParam(), bd);
         RandomInput16(input1_, GetParam(), bd);
         RandomInput16(gx0_, GetParam(), bd + 1);
         RandomInput16(gy0_, GetParam(), bd + 1);
         RandomInput16(gx1_, GetParam(), bd + 1);
         RandomInput16(gy1_, GetParam(), bd + 1);

         TestOptFlowRefine(input0_, input1_, gx0_, gy0_, gx1_, gy1_, is_speed,
                           d0, d1);
         count++;
       }
     }
     if (is_speed) return;

     // Extreme value test. oh_start_bits is 1 here, so the step of
     // kMaxOrderHintBits - 1 visits exactly 1 and kMaxOrderHintBits.
     for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits;
          oh_bits += kMaxOrderHintBits - 1) {
       for (int count = 0; count < numIter;) {
         const int d0 = RelativeDistExtreme(oh_bits);
         const int d1 = RelativeDistExtreme(oh_bits);
         if (!d0 || !d1) continue;

         RandomInput16Extreme(input0_, GetParam(), bd);
         RandomInput16Extreme(input1_, GetParam(), bd);
         RandomInput16Extreme(gx0_, GetParam(), bd + 1);
         RandomInput16Extreme(gy0_, GetParam(), bd + 1);
         RandomInput16Extreme(gx1_, GetParam(), bd + 1);
         RandomInput16Extreme(gy1_, GetParam(), bd + 1);

         TestOptFlowRefine(input0_, input1_, gx0_, gy0_, gx1_, gy1_, 0, d0, d1);
         count++;
       }
     }
   }

  private:
   // Looks up the block geometry and the function under test, then dispatches
   // to the timing or the comparison routine.
   void TestOptFlowRefine(uint16_t *input0, uint16_t *input1, int16_t *gx0,
                          int16_t *gy0, int16_t *gx1, int16_t *gy1,
                          const int is_speed, int d0, int d1) {
     const BlockSize &block = GetParam().Block();
     const int bw = block.Width();
     const int bh = block.Height();
     const int n = block.OptFlowBlkSize();

     opfl_mv_refinement_highbd ref_func = av1_opfl_mv_refinement_nxn_highbd_c;
     opfl_mv_refinement_highbd test_func = GetParam().TestFunction();

     if (is_speed)
       OptFlowRefineSpeed(ref_func, test_func, input0, input1, gx0, gy0, gx1,
                          gy1, bw, bh, n, d0, d1);
     else
       OptFlowRefine(ref_func, test_func, input0, input1, gx0, gy0, gx1, gy1, bw,
                     bh, n, d0, d1);
   }

   // Runs both functions on the same inputs and asserts that they report the
   // same number of sub-blocks and the same refined values for each of them.
   // Each output array is split into four N_OF_OFFSETS-sized slots indexed by
   // kVX_0, kVX_1, kVY_0 and kVY_1.
   void OptFlowRefine(opfl_mv_refinement_highbd ref_func,
                      opfl_mv_refinement_highbd test_func,
                      const uint16_t *input0, const uint16_t *input1,
                      const int16_t *gx0, const int16_t *gy0, const int16_t *gx1,
                      const int16_t *gy1, int bw, int bh, int n, int d0,
                      int d1) {
     int ref_out[4 * N_OF_OFFSETS] = { 0 };
     int test_out[4 * N_OF_OFFSETS] = { 0 };
     const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2;
     const int mv_prec_bits = MV_REFINE_PREC_BITS;
     // All buffers are tightly packed, so every stride equals the block width.
     int stride0 = bw;
     int stride1 = bw;
     int gstride = bw;

     const int n_blocks_ref = ref_func(
         input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh,
         n, d0, d1, grad_prec_bits, mv_prec_bits, &ref_out[kVX_0 * N_OF_OFFSETS],
         &ref_out[kVY_0 * N_OF_OFFSETS], &ref_out[kVX_1 * N_OF_OFFSETS],
         &ref_out[kVY_1 * N_OF_OFFSETS]);
     const int n_blocks = test_func(
         input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh,
         n, d0, d1, grad_prec_bits, mv_prec_bits,
         &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS],
         &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]);

     // The sub-block count is part of the contract as well; without this check
     // a wrong count from the function under test would go unnoticed.
     ASSERT_EQ(n_blocks_ref, n_blocks) << "Mismatch of subblock numbers";
     AssertOutputEq(&ref_out[kVX_0 * N_OF_OFFSETS],
                    &test_out[kVX_0 * N_OF_OFFSETS], n_blocks);
     AssertOutputEq(&ref_out[kVY_0 * N_OF_OFFSETS],
                    &test_out[kVY_0 * N_OF_OFFSETS], n_blocks);
     AssertOutputEq(&ref_out[kVX_1 * N_OF_OFFSETS],
                    &test_out[kVX_1 * N_OF_OFFSETS], n_blocks);
     AssertOutputEq(&ref_out[kVY_1 * N_OF_OFFSETS],
                    &test_out[kVY_1 * N_OF_OFFSETS], n_blocks);
   }

   // Times the reference and the function under test over the same number of
   // calls and prints both elapsed times and their ratio. Outputs are not
   // compared.
   void OptFlowRefineSpeed(opfl_mv_refinement_highbd ref_func,
                           opfl_mv_refinement_highbd test_func,
                           const uint16_t *input0, const uint16_t *input1,
                           const int16_t *gx0, const int16_t *gy0,
                           const int16_t *gx1, const int16_t *gy1, int bw,
                           int bh, int n, int d0, int d1) {
     int ref_out[4 * N_OF_OFFSETS] = { 0 };
     int test_out[4 * N_OF_OFFSETS] = { 0 };
     const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2;
     const int mv_prec_bits = MV_REFINE_PREC_BITS;
     // Block dimensions in units of (1 << MI_SIZE_LOG2) samples.
     const int bw_mi = bw >> MI_SIZE_LOG2;
     const int bh_mi = bh >> MI_SIZE_LOG2;
     int stride0 = bw;
     int stride1 = bw;
     int gstride = bw;

     const int numIter = 2097152 / (bw_mi * bh_mi);
     aom_usec_timer timer_ref;
     aom_usec_timer timer_test;

     aom_usec_timer_start(&timer_ref);
     for (int count = 0; count < numIter; count++) {
       ref_func(input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride,
                bw, bh, n, d0, d1, grad_prec_bits, mv_prec_bits,
                &ref_out[kVX_0 * N_OF_OFFSETS], &ref_out[kVY_0 * N_OF_OFFSETS],
                &ref_out[kVX_1 * N_OF_OFFSETS], &ref_out[kVY_1 * N_OF_OFFSETS]);
     }
     aom_usec_timer_mark(&timer_ref);

     aom_usec_timer_start(&timer_test);
     for (int count = 0; count < numIter; count++) {
       test_func(
           input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh,
           n, d0, d1, grad_prec_bits, mv_prec_bits,
           &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS],
           &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]);
     }
     aom_usec_timer_mark(&timer_test);

     const int total_time_ref =
         static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
     const int total_time_test =
         static_cast<int>(aom_usec_timer_elapsed(&timer_test));

     printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
            total_time_test,
            (static_cast<float>(total_time_ref) /
             static_cast<float>(total_time_test)));
   }

   // Slot indices into the 4 * N_OF_OFFSETS output arrays.
   static constexpr int kVX_0 = 0;
   static constexpr int kVX_1 = 1;
   static constexpr int kVY_0 = 2;
   static constexpr int kVY_1 = 3;
   static constexpr int kMaxOrderHintBits = 8;
   static constexpr int kSubpelGradDeltaBits = 3;
   uint16_t *input0_;
   uint16_t *input1_;
   int16_t *gx0_;
   int16_t *gy0_;
   int16_t *gx1_;
   int16_t *gy1_;
 };
 // Output comparison: RunTest(0) checks the function under test against the C
 // reference on random and on extreme-valued inputs.
 TEST_P(AV1OptFlowRefineHighbdTest, CheckOutput) { RunTest(0); }
 // Timing run: RunTest(1) only measures and prints elapsed times. The
 // DISABLED_ prefix keeps googletest from running it by default.
 TEST_P(AV1OptFlowRefineHighbdTest, DISABLED_Speed) { RunTest(1); }

 // Instantiation in which the function under test is the C implementation
 // itself.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1OptFlowRefineHighbdTest,
     BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_highbd_c));

 // Instantiation for the SSE4.1 implementation, compiled only when
 // HAVE_SSE4_1 is set.
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1OptFlowRefineHighbdTest,
     BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_highbd_sse4_1));
 #endif

 #if OPFL_BICUBIC_GRAD
 // Signature shared by the bicubic gradient interpolation functions exercised
 // by AV1OptFlowBiCubicGradHighbdTest.
 using bicubic_grad_interp_highbd = void (*)(const int16_t *pred_src,
                                             int16_t *x_grad, int16_t *y_grad,
                                             const int blk_width,
                                             const int blk_height);

 // Checks a bicubic gradient interpolation implementation with the
 // bicubic_grad_interp_highbd signature against
 // av1_bicubic_grad_interpolation_highbd_c, and can time the two against each
 // other.
 class AV1OptFlowBiCubicGradHighbdTest
     : public AV1OptFlowTest<bicubic_grad_interp_highbd> {
  public:
   // Allocates 16-byte aligned buffers of width * height samples for the
   // source block and for the reference/test gradients; the gradient buffers
   // start out zeroed.
   AV1OptFlowBiCubicGradHighbdTest() {
     const BlockSize &blk = GetParam().Block();
     const auto buf_bytes = blk.Width() * blk.Height() * sizeof(int16_t);

     pred_src_ = static_cast<int16_t *>(aom_memalign(16, buf_bytes));

     int16_t **const grad_bufs[] = { &x_grad_ref_, &y_grad_ref_, &x_grad_test_,
                                     &y_grad_test_ };
     for (int16_t **buf : grad_bufs) {
       *buf = static_cast<int16_t *>(aom_memalign(16, buf_bytes));
     }
     for (int16_t **buf : grad_bufs) {
       memset(*buf, 0, buf_bytes);
     }
   }

   ~AV1OptFlowBiCubicGradHighbdTest() {
     aom_free(pred_src_);
     aom_free(x_grad_ref_);
     aom_free(y_grad_ref_);
     aom_free(x_grad_test_);
     aom_free(y_grad_test_);
   }

   // Runs the comparison (is_speed == 0) or the timing loop (is_speed != 0).
   // In comparison mode a random-input pass is followed by an extreme-value
   // pass with the same iteration count.
   void Run(const int is_speed) {
     const TestParam<bicubic_grad_interp_highbd> &param = GetParam();
     const int bd = param.BitDepth();
     // Block dimensions in units of (1 << MI_SIZE_LOG2) samples; larger blocks
     // get proportionally fewer iterations.
     const int w_units = param.Block().Width() >> MI_SIZE_LOG2;
     const int h_units = param.Block().Height() >> MI_SIZE_LOG2;
     const int numIter = is_speed ? 1 : 16384 / (w_units * h_units);

     for (int iter = 0; iter < numIter; ++iter) {
       RandomInput16(pred_src_, param, bd);
       TestBicubicGradHighbd(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_,
                             y_grad_test_, is_speed);
     }
     if (is_speed) return;

     for (int iter = 0; iter < numIter; ++iter) {
       // The cast selects the unsigned overload, so every sample is either 0
       // or (1 << bd) - 1.
       RandomInput16Extreme(reinterpret_cast<uint16_t *>(pred_src_), param, bd);
       TestBicubicGradHighbd(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_,
                             y_grad_test_, 0);
     }
   }

  private:
   // Looks up the block geometry and the function under test, then dispatches
   // to the timing or the comparison routine.
   void TestBicubicGradHighbd(int16_t *pred_src, int16_t *x_grad_ref,
                              int16_t *y_grad_ref, int16_t *x_grad_test,
                              int16_t *y_grad_test, int is_speed) {
     const BlockSize &blk = GetParam().Block();
     const int bw = blk.Width();
     const int bh = blk.Height();

     const bicubic_grad_interp_highbd ref_func =
         av1_bicubic_grad_interpolation_highbd_c;
     const bicubic_grad_interp_highbd test_func = GetParam().TestFunction();
     if (!is_speed) {
       BicubicGradHighbd(ref_func, test_func, pred_src, x_grad_ref, y_grad_ref,
                         x_grad_test, y_grad_test, bw, bh);
       return;
     }
     BicubicGradHighbdSpeed(ref_func, test_func, pred_src, x_grad_ref,
                            y_grad_ref, x_grad_test, y_grad_test, bw, bh);
   }

   // Runs both functions on |pred_src| and asserts that the x and y gradient
   // buffers match over the whole block (stride == bw).
   void BicubicGradHighbd(bicubic_grad_interp_highbd ref_func,
                          bicubic_grad_interp_highbd test_func,
                          const int16_t *pred_src, int16_t *x_grad_ref,
                          int16_t *y_grad_ref, int16_t *x_grad_test,
                          int16_t *y_grad_test, const int bw, const int bh) {
     ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh);
     test_func(pred_src, x_grad_test, y_grad_test, bw, bh);

     const int stride = bw;
     AssertOutputBufferEq(x_grad_ref, x_grad_test, bw, bh, stride);
     AssertOutputBufferEq(y_grad_ref, y_grad_test, bw, bh, stride);
   }

   // Times the reference and the function under test over the same number of
   // calls and prints both elapsed times and their ratio. Outputs are not
   // compared.
   void BicubicGradHighbdSpeed(bicubic_grad_interp_highbd ref_func,
                               bicubic_grad_interp_highbd test_func,
                               int16_t *pred_src, int16_t *x_grad_ref,
                               int16_t *y_grad_ref, int16_t *x_grad_test,
                               int16_t *y_grad_test, const int bw,
                               const int bh) {
     const int w_units = bw >> MI_SIZE_LOG2;
     const int h_units = bh >> MI_SIZE_LOG2;
     const int numIter = 2097152 / (w_units * h_units);

     aom_usec_timer timer_ref;
     aom_usec_timer_start(&timer_ref);
     for (int iter = 0; iter < numIter; ++iter) {
       ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh);
     }
     aom_usec_timer_mark(&timer_ref);

     aom_usec_timer timer_test;
     aom_usec_timer_start(&timer_test);
     for (int iter = 0; iter < numIter; ++iter) {
       test_func(pred_src, x_grad_test, y_grad_test, bw, bh);
     }
     aom_usec_timer_mark(&timer_test);

     const int total_time_ref =
         static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
     const int total_time_test =
         static_cast<int>(aom_usec_timer_elapsed(&timer_test));

     printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
            total_time_test,
            (static_cast<float>(total_time_ref) /
             static_cast<float>(total_time_test)));
   }

   int16_t *pred_src_;
   int16_t *x_grad_ref_;
   int16_t *y_grad_ref_;
   int16_t *x_grad_test_;
   int16_t *y_grad_test_;
 };
 // Output comparison: Run(0) checks the function under test against the C
 // reference on random and on extreme-valued inputs.
 TEST_P(AV1OptFlowBiCubicGradHighbdTest, CheckOutput) { Run(0); }
 // Timing run: Run(1) only measures and prints elapsed times. The DISABLED_
 // prefix keeps googletest from running it by default.
 TEST_P(AV1OptFlowBiCubicGradHighbdTest, DISABLED_Speed) { Run(1); }

 // Instantiation in which the function under test is the C implementation
 // itself.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1OptFlowBiCubicGradHighbdTest,
     BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_c));

 // Instantiation for the SSE4.1 implementation, compiled only when
 // HAVE_SSE4_1 is set.
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1OptFlowBiCubicGradHighbdTest,
     BuildOptFlowHighbdParams(av1_bicubic_grad_interpolation_highbd_sse4_1));
 #endif
 #endif  // OPFL_BICUBIC_GRAD

 #if OPFL_COMBINE_INTERP_GRAD_LS
 // Signature shared by the optical flow MV refinement functions that take a
 // single difference buffer and a single pair of gradients, as exercised by
 // AV1OptFlowRefineInterpGradTest.
 using opfl_mv_refinement_interp_grad = int (*)(
     const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
     int gstride, int bw, int bh, int n, int d0, int d1, int grad_prec_bits,
     int mv_prec_bits, int *vx0, int *vy0, int *vx1, int *vy1);

 class AV1OptFlowRefineInterpGradTest
     : public AV1OptFlowTest<opfl_mv_refinement_interp_grad> {
  public:
   AV1OptFlowRefineInterpGradTest() {
     const BlockSize &block = GetParam().Block();
     const int bw = block.Width();
     const int bh = block.Height();

     input_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*input_));
     gx_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*gx_));
     gy_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(*gy_));
   }

   ~AV1OptFlowRefineInterpGradTest() {
     aom_free(input_);
     aom_free(gx_);
     aom_free(gy_);
   }

   void RunTest(const int is_speed) {
     OrderHintInfo oh_info;
     const BlockSize &block = GetParam().Block();
     const int bd = GetParam().BitDepth();
     const int bw_log2 = block.Width() >> MI_SIZE_LOG2;
     const int bh_log2 = block.Height() >> MI_SIZE_LOG2;
     const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2);
     const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1;

     oh_info.enable_order_hint = 1;
     for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) {
       for (int count = 0; count < numIter;) {
         const int cur_frm_idx = RandomFrameIdx(oh_bits);
         const int ref0_frm_idx = RandomFrameIdx(oh_bits);
         const int ref1_frm_idx = RandomFrameIdx(oh_bits);

         oh_info.order_hint_bits_minus_1 = oh_bits - 1;
         const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx);
         const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx);
         if (!d0 || !d1) continue;

         // Here, the input corresponds to 'd0*p0 - d1*p1' (where P0 and P1 can
         // be 12 bits, d0 and d1 can be >=5 bits) and gx, gy are gradients of
         // input. Due to the clamping of these value to [INT16_MIN, INT16_MAX],
         // testing of the same is required. Hence, populating the input_, gx_
         // and gy_ buffers as per the requirement.
         RandomInput16(input_, GetParam(), AOMMIN(16, bd + 1));
         RandomInput16(gx_, GetParam(), AOMMIN(16, bd + 6));
         RandomInput16(gy_, GetParam(), AOMMIN(16, bd + 6));

         TestOptFlowRefine(input_, gx_, gy_, is_speed, d0, d1);
         count++;
       }
     }
     if (is_speed) return;

     // Extreme value test
     for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits;
          oh_bits += kMaxOrderHintBits - 1) {
       for (int count = 0; count < numIter;) {
         const int d0 = RelativeDistExtreme(oh_bits);
         const int d1 = RelativeDistExtreme(oh_bits);
         if (!d0 || !d1) continue;

         RandomInput16Extreme(input_, GetParam(), AOMMIN(16, bd + 1));
         RandomInput16Extreme(gx_, GetParam(), AOMMIN(16, bd + 6));
         RandomInput16Extreme(gy_, GetParam(), AOMMIN(16, bd + 6));

         TestOptFlowRefine(input_, gx_, gy_, 0, d0, d1);
         count++;
       }
     }
   }

  private:
   void TestOptFlowRefine(int16_t *input, int16_t *gx, int16_t *gy,
                          const int is_speed, int d0, int d1) {
     const BlockSize &block = GetParam().Block();
     const int bw = block.Width();
     const int bh = block.Height();
     const int n = block.OptFlowBlkSize();

     opfl_mv_refinement_interp_grad ref_func =
         av1_opfl_mv_refinement_nxn_interp_grad_c;
     opfl_mv_refinement_interp_grad test_func = GetParam().TestFunction();

     if (is_speed)
       OptFlowRefineSpeed(ref_func, test_func, input, gx, gy, bw, bh, n, d0, d1);
     else
       OptFlowRefine(ref_func, test_func, input, gx, gy, bw, bh, n, d0, d1);
   }

   void OptFlowRefine(opfl_mv_refinement_interp_grad ref_func,
                      opfl_mv_refinement_interp_grad test_func,
                      const int16_t *input, const int16_t *gx, const int16_t *gy,
                      int bw, int bh, int n, int d0, int d1) {
     int ref_out[4 * N_OF_OFFSETS] = { 0 };
     int test_out[4 * N_OF_OFFSETS] = { 0 };
     const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2;
     const int mv_prec_bits = MV_REFINE_PREC_BITS;
     int stride = bw;
     int gstride = bw;

     int n_blocks_ref =
         ref_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1,
                  grad_prec_bits, mv_prec_bits, &ref_out[kVX_0 * N_OF_OFFSETS],
                  &ref_out[kVY_0 * N_OF_OFFSETS], &ref_out[kVX_1 * N_OF_OFFSETS],
                  &ref_out[kVY_1 * N_OF_OFFSETS]);
     int n_blocks = test_func(
         input, stride, gx, gy, gstride, bw, bh, n, d0, d1, grad_prec_bits,
         mv_prec_bits, &test_out[kVX_0 * N_OF_OFFSETS],
         &test_out[kVY_0 * N_OF_OFFSETS], &test_out[kVX_1 * N_OF_OFFSETS],
         &test_out[kVY_1 * N_OF_OFFSETS]);

     ASSERT_EQ(n_blocks_ref, n_blocks) << "Mismatch of subblock numbers";
     AssertOutputEq(&ref_out[kVX_0 * N_OF_OFFSETS],
                    &test_out[kVX_0 * N_OF_OFFSETS], n_blocks);
     AssertOutputEq(&ref_out[kVY_0 * N_OF_OFFSETS],
                    &test_out[kVY_0 * N_OF_OFFSETS], n_blocks);
     AssertOutputEq(&ref_out[kVX_1 * N_OF_OFFSETS],
                    &test_out[kVX_1 * N_OF_OFFSETS], n_blocks);
     AssertOutputEq(&ref_out[kVY_1 * N_OF_OFFSETS],
                    &test_out[kVY_1 * N_OF_OFFSETS], n_blocks);
   }

   // Speed comparison for one block configuration: ref_func and then
   // test_func are each called the same number of times on identical inputs,
   // and the two elapsed times together with their ratio are printed.
   // The call count is 2097152 divided by the product of the block dimensions
   // after each has been shifted right by MI_SIZE_LOG2.
   void OptFlowRefineSpeed(opfl_mv_refinement_interp_grad ref_func,
                           opfl_mv_refinement_interp_grad test_func,
                           const int16_t *input, const int16_t *gx,
                           const int16_t *gy, int bw, int bh, int n, int d0,
                           int d1) {
     // Precision arguments forwarded unchanged to both implementations.
     const int mv_prec_bits = MV_REFINE_PREC_BITS;
     const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2;
     // The sample buffer and the gradient buffers share a row pitch of bw.
     const int stride = bw;
     const int gstride = bw;

     // Each output array holds four slots of N_OF_OFFSETS entries; the slot
     // start addresses are resolved once, outside the timed loops.
     int ref_out[4 * N_OF_OFFSETS] = { 0 };
     int test_out[4 * N_OF_OFFSETS] = { 0 };
     int *const ref_vx0 = ref_out + kVX_0 * N_OF_OFFSETS;
     int *const ref_vy0 = ref_out + kVY_0 * N_OF_OFFSETS;
     int *const ref_vx1 = ref_out + kVX_1 * N_OF_OFFSETS;
     int *const ref_vy1 = ref_out + kVY_1 * N_OF_OFFSETS;
     int *const test_vx0 = test_out + kVX_0 * N_OF_OFFSETS;
     int *const test_vy0 = test_out + kVY_0 * N_OF_OFFSETS;
     int *const test_vx1 = test_out + kVX_1 * N_OF_OFFSETS;
     int *const test_vy1 = test_out + kVY_1 * N_OF_OFFSETS;

     const int units_wide = bw >> MI_SIZE_LOG2;
     const int units_high = bh >> MI_SIZE_LOG2;
     const int num_calls = 2097152 / (units_wide * units_high);

     // Time the reference implementation.
     aom_usec_timer timer_ref;
     aom_usec_timer_start(&timer_ref);
     for (int remaining = num_calls; remaining > 0; --remaining) {
       ref_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1,
                grad_prec_bits, mv_prec_bits, ref_vx0, ref_vy0, ref_vx1,
                ref_vy1);
     }
     aom_usec_timer_mark(&timer_ref);

     // Time the implementation under test.
     aom_usec_timer timer_test;
     aom_usec_timer_start(&timer_test);
     for (int remaining = num_calls; remaining > 0; --remaining) {
       test_func(input, stride, gx, gy, gstride, bw, bh, n, d0, d1,
                 grad_prec_bits, mv_prec_bits, test_vx0, test_vy0, test_vx1,
                 test_vy1);
     }
     aom_usec_timer_mark(&timer_test);

     const int total_time_ref =
         static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
     const int total_time_test =
         static_cast<int>(aom_usec_timer_elapsed(&timer_test));
     const float gain = static_cast<float>(total_time_ref) /
                        static_cast<float>(total_time_test);

     printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
            total_time_test, gain);
   }

   // Slot indices into the 4 * N_OF_OFFSETS output arrays used by the methods
   // above: slot k starts at element k * N_OF_OFFSETS.
   static constexpr int kVX_0 = 0;
   static constexpr int kVX_1 = 1;
   static constexpr int kVY_0 = 2;
   static constexpr int kVY_1 = 3;
   // NOTE(review): presumably the widest order-hint bit count this fixture
   // sweeps; its use lies outside this excerpt - confirm.
   static constexpr int kMaxOrderHintBits = 8;
   // Enters the gradient precision as 3 - kSubpelGradDeltaBits - 2.
   static constexpr int kSubpelGradDeltaBits = 3;
   // NOTE(review): presumably fixture-owned input and gradient buffers; their
   // allocation and release are outside this excerpt - confirm.
   int16_t *input_;
   int16_t *gx_;
   int16_t *gy_;
 };
 // CheckOutput calls RunTest(0); Speed calls RunTest(1) and carries the
 // DISABLED_ prefix, so it only runs when disabled tests are requested.
 TEST_P(AV1OptFlowRefineInterpGradTest, CheckOutput) { RunTest(0); }
 TEST_P(AV1OptFlowRefineInterpGradTest, DISABLED_Speed) { RunTest(1); }

 // Parameter set built around the C implementation.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1OptFlowRefineInterpGradTest,
     BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_interp_grad_c));

 #if HAVE_SSE4_1
 // Parameter set built around the SSE4.1 implementation.
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1OptFlowRefineInterpGradTest,
     BuildOptFlowHighbdParams(av1_opfl_mv_refinement_nxn_interp_grad_sse4_1));
 #endif
 #endif  // OPFL_COMBINE_INTERP_GRAD_LS

 #if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
 // Function-pointer type of the prediction-copy kernels exercised below
 // (av1_copy_pred_array_highbd_c and av1_copy_pred_array_highbd_sse4_1).
 // The tests call it with two bw x bh uint16_t sources, two bw x bh int16_t
 // destinations, non-zero distances d0/d1, and centered == 0.
 // NOTE(review): the exact meaning of d0, d1 and centered is defined by the
 // implementation, which is not visible here - confirm.
 typedef void (*pred_buffer_copy_highbd)(const uint16_t *src1,
                                         const uint16_t *src2, int16_t *dst1,
                                         int16_t *dst2, int bw, int bh, int d0,
                                         int d1, int centered);

 // Compares the parameterized pred_buffer_copy_highbd implementation against
 // av1_copy_pred_array_highbd_c on random and extreme inputs, and offers a
 // speed comparison of the two.
 class AV1OptFlowCopyPredHighbdTest
     : public AV1OptFlowTest<pred_buffer_copy_highbd> {
  public:
   // Allocates 16-byte aligned bw x bh buffers for the parameterized block
   // size. Fatal assertions cannot be used in a constructor, so allocation
   // results are validated at the start of Run() instead.
   AV1OptFlowCopyPredHighbdTest() {
     const BlockSize &block = GetParam().Block();
     const int num_pels = block.Width() * block.Height();

     src_buf1_ = static_cast<uint16_t *>(
         aom_memalign(16, num_pels * sizeof(*src_buf1_)));
     src_buf2_ = static_cast<uint16_t *>(
         aom_memalign(16, num_pels * sizeof(*src_buf2_)));
     dst_buf1_ref_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(*dst_buf1_ref_)));
     dst_buf2_ref_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(*dst_buf2_ref_)));
     dst_buf1_test_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(*dst_buf1_test_)));
     dst_buf2_test_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(*dst_buf2_test_)));

     // Both copies of the second destination start out zeroed.
     // NOTE(review): presumably because some build configurations never write
     // dst2, which would otherwise leave garbage to be compared - confirm.
     // Guarded so a failed allocation is reported by Run() rather than
     // crashing here.
     if (dst_buf2_ref_ != nullptr) {
       memset(dst_buf2_ref_, 0, num_pels * sizeof(*dst_buf2_ref_));
     }
     if (dst_buf2_test_ != nullptr) {
       memset(dst_buf2_test_, 0, num_pels * sizeof(*dst_buf2_test_));
     }
   }

   ~AV1OptFlowCopyPredHighbdTest() {
     aom_free(src_buf1_);
     aom_free(src_buf2_);
     aom_free(dst_buf1_ref_);
     aom_free(dst_buf2_ref_);
     aom_free(dst_buf1_test_);
     aom_free(dst_buf2_test_);
   }

   // Test driver.
   //   is_speed == 0: bit-exactness check on random inputs for every
   //     order-hint width in [1, kMaxOrderHintBits], followed by an
   //     extreme-value pass at the smallest and largest widths.
   //   is_speed != 0: a single timing run at kMaxOrderHintBits.
   // Stops at the first fatal failure instead of repeating it for every
   // remaining iteration.
   void Run(const int is_speed) {
     ASSERT_NE(src_buf1_, nullptr);
     ASSERT_NE(src_buf2_, nullptr);
     ASSERT_NE(dst_buf1_ref_, nullptr);
     ASSERT_NE(dst_buf2_ref_, nullptr);
     ASSERT_NE(dst_buf1_test_, nullptr);
     ASSERT_NE(dst_buf2_test_, nullptr);

     OrderHintInfo oh_info;
     const BlockSize &block = GetParam().Block();
     // Block dimensions in units of (1 << MI_SIZE_LOG2) samples; larger
     // blocks get proportionally fewer iterations.
     const int bw_units = block.Width() >> MI_SIZE_LOG2;
     const int bh_units = block.Height() >> MI_SIZE_LOG2;
     const int bd = GetParam().BitDepth();
     const int numIter = is_speed ? 1 : 16384 / (bw_units * bh_units);
     const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1;

     oh_info.enable_order_hint = 1;
     for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) {
       oh_info.order_hint_bits_minus_1 = oh_bits - 1;
       // count only advances for usable (non-zero) distance pairs.
       for (int count = 0; count < numIter;) {
         const int cur_frm_idx = RandomFrameIdx(oh_bits);
         const int ref0_frm_idx = RandomFrameIdx(oh_bits);
         const int ref1_frm_idx = RandomFrameIdx(oh_bits);

         const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx);
         const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx);
         if (!d0 || !d1) continue;

         RandomInput16(src_buf1_, GetParam(), bd);
         RandomInput16(src_buf2_, GetParam(), bd);
         TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_,
                           dst_buf1_test_, dst_buf2_test_, d0, d1, is_speed);
         // A fatal assertion inside the helper only returns from the helper;
         // propagate it so the loops do not keep failing.
         if (::testing::Test::HasFatalFailure()) return;
         count++;
       }
     }
     if (is_speed) return;

     // Extreme value test. The step of kMaxOrderHintBits - 1 visits only
     // oh_bits == 1 and oh_bits == kMaxOrderHintBits.
     for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits;
          oh_bits += kMaxOrderHintBits - 1) {
       for (int count = 0; count < numIter;) {
         const int d0 = RelativeDistExtreme(oh_bits);
         const int d1 = RelativeDistExtreme(oh_bits);
         if (!d0 || !d1) continue;

         RandomInput16Extreme(src_buf1_, GetParam(), bd);
         RandomInput16Extreme(src_buf2_, GetParam(), bd);
         TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_,
                           dst_buf1_test_, dst_buf2_test_, d0, d1, 0);
         if (::testing::Test::HasFatalFailure()) return;
         count++;
       }
     }
   }

  private:
   // Dispatches one prepared input set to either the timing path or the
   // output-comparison path.
   void TestCopyPredArray(const uint16_t *src_buf1, const uint16_t *src_buf2,
                          int16_t *dst_buf1_ref, int16_t *dst_buf2_ref,
                          int16_t *dst_buf1_test, int16_t *dst_buf2_test, int d0,
                          int d1, int is_speed) {
     const BlockSize &block = GetParam().Block();
     const int bw = block.Width();
     const int bh = block.Height();

     pred_buffer_copy_highbd ref_func = av1_copy_pred_array_highbd_c;
     pred_buffer_copy_highbd test_func = GetParam().TestFunction();
     if (is_speed) {
       CopyPredArraySpeed(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref,
                          dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw,
                          bh);
     } else {
       CopyPredArray(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref,
                     dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw, bh);
     }
   }

   // Runs both implementations once (centered == 0) and compares both
   // destination buffers over the bw x bh block with a row pitch of bw.
   void CopyPredArray(pred_buffer_copy_highbd ref_func,
                      pred_buffer_copy_highbd test_func,
                      const uint16_t *src_buf1, const uint16_t *src_buf2,
                      int16_t *dst_buf1_ref, int16_t *dst_buf2_ref,
                      int16_t *dst_buf1_test, int16_t *dst_buf2_test,
                      const int d0, const int d1, const int bw, const int bh) {
     ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1, 0);
     test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0, d1,
               0);

     AssertOutputBufferEq(dst_buf1_ref, dst_buf1_test, bw, bh, bw);
     AssertOutputBufferEq(dst_buf2_ref, dst_buf2_test, bw, bh, bw);
   }

   // Times the same number of back-to-back calls of each implementation and
   // prints both elapsed times and their ratio.
   void CopyPredArraySpeed(pred_buffer_copy_highbd ref_func,
                           pred_buffer_copy_highbd test_func,
                           const uint16_t *src_buf1, const uint16_t *src_buf2,
                           int16_t *dst_buf1_ref, int16_t *dst_buf2_ref,
                           int16_t *dst_buf1_test, int16_t *dst_buf2_test,
                           const int d0, const int d1, const int bw,
                           const int bh) {
     const int bw_units = bw >> MI_SIZE_LOG2;
     const int bh_units = bh >> MI_SIZE_LOG2;
     printf("bw=%d, bh=%d\n", bw, bh);
     const int numIter = 2097152 / (bw_units * bh_units);
     aom_usec_timer timer_ref;
     aom_usec_timer timer_test;

     aom_usec_timer_start(&timer_ref);
     for (int count = 0; count < numIter; count++) {
       ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1,
                0);
     }
     aom_usec_timer_mark(&timer_ref);

     aom_usec_timer_start(&timer_test);
     for (int count = 0; count < numIter; count++) {
       test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0,
                 d1, 0);
     }
     aom_usec_timer_mark(&timer_test);

     const int total_time_ref =
         static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
     const int total_time_test =
         static_cast<int>(aom_usec_timer_elapsed(&timer_test));

     printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
            total_time_test,
            (static_cast<float>(total_time_ref) /
             static_cast<float>(total_time_test)));
   }

   // Source blocks fed to both implementations.
   uint16_t *src_buf1_;
   uint16_t *src_buf2_;
   // Destination blocks written by the reference implementation.
   int16_t *dst_buf1_ref_;
   int16_t *dst_buf2_ref_;
   // Destination blocks written by the implementation under test.
   int16_t *dst_buf1_test_;
   int16_t *dst_buf2_test_;
   // Largest order-hint bit width swept by Run().
   static constexpr int kMaxOrderHintBits = 8;
 };

 // CheckOutput calls Run(0) (output comparison); Speed calls Run(1) (timing)
 // and carries the DISABLED_ prefix, so it only runs when disabled tests are
 // requested.
 TEST_P(AV1OptFlowCopyPredHighbdTest, CheckOutput) { Run(0); }
 TEST_P(AV1OptFlowCopyPredHighbdTest, DISABLED_Speed) { Run(1); }

 // Parameter set with the C implementation as the function under test.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1OptFlowCopyPredHighbdTest,
     BuildOptFlowHighbdParams(av1_copy_pred_array_highbd_c));

 #if HAVE_SSE4_1
 // Parameter set with the SSE4.1 implementation as the function under test.
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1OptFlowCopyPredHighbdTest,
     BuildOptFlowHighbdParams(av1_copy_pred_array_highbd_sse4_1));
 #endif
 #endif  // OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD

 #if CONFIG_AFFINE_REFINEMENT
 #if OPFL_COMBINE_INTERP_GRAD_LS
 // Function-pointer type of the affine autocorrelation kernels exercised
 // below (av1_calc_affine_autocorrelation_matrix_c and its AVX2 variant).
 // The tests pass a bw x bh pdiff block with pitch pstride, bw x bh gx/gy
 // blocks with pitch gstride, a 16-entry int64_t mat_a and a 4-entry int64_t
 // vec_b, both zeroed before the call.
 typedef void (*calc_affine_autocorrelation_matrix)(
     const int16_t *pdiff, int pstride, const int16_t *gx, const int16_t *gy,
     int gstride, int bw, int bh, int64_t *mat_a, int64_t *vec_b);

 // Compares the parameterized calc_affine_autocorrelation_matrix
 // implementation against av1_calc_affine_autocorrelation_matrix_c on random
 // and extreme inputs, and offers a speed comparison of the two.
 class AV1CalcAffineAutocorrelationMatrixTest
     : public AV1OptFlowTest<calc_affine_autocorrelation_matrix> {
  public:
   // Allocates 16-byte aligned bw x bh gradient and prediction-difference
   // buffers. Fatal assertions cannot be used in a constructor, so the
   // allocation results are validated at the start of RunTest() instead.
   AV1CalcAffineAutocorrelationMatrixTest() {
     const BlockSize &block = GetParam().Block();
     const int num_pels = block.Width() * block.Height();

     gx_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     gy_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     pdiff_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
   }

   ~AV1CalcAffineAutocorrelationMatrixTest() {
     aom_free(gx_);
     aom_free(gy_);
     aom_free(pdiff_);
   }

   // Test driver.
   //   is_speed == 0: output comparison on random inputs, then on extreme
   //     inputs, numIter times each.
   //   is_speed != 0: one timing run on random inputs.
   // Gradients are generated with 16 bits and pdiff with bd + 1 bits.
   // Stops at the first fatal failure instead of repeating it for every
   // remaining iteration.
   void RunTest(const int is_speed) {
     ASSERT_NE(gx_, nullptr);
     ASSERT_NE(gy_, nullptr);
     ASSERT_NE(pdiff_, nullptr);

     const BlockSize &block = GetParam().Block();
     const int bd = GetParam().BitDepth();
     // Block dimensions in units of (1 << MI_SIZE_LOG2) samples; larger
     // blocks get proportionally fewer iterations.
     const int bw_units = block.Width() >> MI_SIZE_LOG2;
     const int bh_units = block.Height() >> MI_SIZE_LOG2;
     const int numIter = is_speed ? 1 : 16384 / (bw_units * bh_units);

     for (int count = 0; count < numIter; count++) {
       RandomInput16(gx_, GetParam(), 16);
       RandomInput16(gy_, GetParam(), 16);
       RandomInput16(pdiff_, GetParam(), bd + 1);

       TestCalcAffineAutoCorrelationMatrix(pdiff_, gx_, gy_, is_speed);
       // ASSERT_* inside the helper only returns from the helper; propagate
       // the failure so the loop does not keep failing.
       if (::testing::Test::HasFatalFailure()) return;
     }
     if (is_speed) return;

     // Extreme value test
     for (int count = 0; count < numIter; count++) {
       RandomInput16Extreme(gx_, GetParam(), 16);
       RandomInput16Extreme(gy_, GetParam(), 16);
       RandomInput16Extreme(pdiff_, GetParam(), bd + 1);

       TestCalcAffineAutoCorrelationMatrix(pdiff_, gx_, gy_, 0);
       if (::testing::Test::HasFatalFailure()) return;
     }
   }

  private:
   // Dispatches one prepared input set to either the timing path or the
   // output-comparison path. All buffers use a row pitch of bw.
   void TestCalcAffineAutoCorrelationMatrix(const int16_t *pdiff,
                                            const int16_t *gx,
                                            const int16_t *gy,
                                            const int is_speed) {
     const BlockSize &block = GetParam().Block();
     const int bw = block.Width();
     const int bh = block.Height();
     const int pstride = bw;
     const int gstride = bw;

     calc_affine_autocorrelation_matrix ref_func =
         av1_calc_affine_autocorrelation_matrix_c;
     calc_affine_autocorrelation_matrix test_func = GetParam().TestFunction();

     if (is_speed) {
       CalcAffineAutoCorrelationMatrixSpeed(ref_func, test_func, pdiff, pstride,
                                            gx, gy, gstride, bw, bh);
     } else {
       AffineAutoCorrelationMatrix(ref_func, test_func, pdiff, pstride, gx, gy,
                                   gstride, bw, bh);
     }
   }

   // Runs both implementations once into zeroed outputs and requires the
   // 16-entry matrix and the 4-entry vector to match exactly. The first
   // mismatching entry of each is printed before the assertion fires.
   void AffineAutoCorrelationMatrix(calc_affine_autocorrelation_matrix ref_func,
                                    calc_affine_autocorrelation_matrix test_func,
                                    const int16_t *pdiff, int pstride,
                                    const int16_t *gx, const int16_t *gy,
                                    int gstride, int bw, int bh) {
     DECLARE_ALIGNED(32, int64_t, mat_ref[16]);
     DECLARE_ALIGNED(32, int64_t, mat_test[16]);
     DECLARE_ALIGNED(32, int64_t, vec_ref[4]);
     DECLARE_ALIGNED(32, int64_t, vec_test[4]);
     memset(mat_ref, 0, sizeof(mat_ref));
     memset(mat_test, 0, sizeof(mat_test));
     memset(vec_ref, 0, sizeof(vec_ref));
     memset(vec_test, 0, sizeof(vec_test));
     ref_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_ref, vec_ref);
     test_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_test, vec_test);

     int failed = 0;
     for (int i = 0; i < 16; ++i) {
       if (mat_ref[i] != mat_test[i]) {
         failed = 1;
         printf("Mat [%4d] ref %6" PRId64 " test %6" PRId64 " \n", i, mat_ref[i],
                mat_test[i]);
         break;
       }
     }

     for (int i = 0; i < 4; ++i) {
       if (vec_ref[i] != vec_test[i]) {
         failed = 1;
         printf("Vec [%4d] ref %6" PRId64 " test %6" PRId64 " \n", i, vec_ref[i],
                vec_test[i]);
         break;
       }
     }
     ASSERT_EQ(failed, 0);
   }

   // Times knumIter back-to-back calls of each implementation and prints the
   // block size, both elapsed times and their ratio.
   void CalcAffineAutoCorrelationMatrixSpeed(
       calc_affine_autocorrelation_matrix ref_func,
       calc_affine_autocorrelation_matrix test_func, const int16_t *pdiff,
       int pstride, const int16_t *gx, const int16_t *gy, int gstride, int bw,
       int bh) {
     DECLARE_ALIGNED(32, int64_t, mat_ref[16]);
     DECLARE_ALIGNED(32, int64_t, mat_test[16]);
     DECLARE_ALIGNED(32, int64_t, vec_ref[4]);
     DECLARE_ALIGNED(32, int64_t, vec_test[4]);
     memset(mat_ref, 0, sizeof(mat_ref));
     memset(mat_test, 0, sizeof(mat_test));
     memset(vec_ref, 0, sizeof(vec_ref));
     memset(vec_test, 0, sizeof(vec_test));
     const int knumIter = 1000000;
     aom_usec_timer timer_ref;
     aom_usec_timer timer_test;

     aom_usec_timer_start(&timer_ref);
     for (int count = 0; count < knumIter; count++) {
       ref_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_ref, vec_ref);
     }
     aom_usec_timer_mark(&timer_ref);

     aom_usec_timer_start(&timer_test);
     for (int count = 0; count < knumIter; count++) {
       test_func(pdiff, pstride, gx, gy, gstride, bw, bh, mat_test, vec_test);
     }
     aom_usec_timer_mark(&timer_test);

     const int total_time_ref =
         static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
     const int total_time_test =
         static_cast<int>(aom_usec_timer_elapsed(&timer_test));

     printf(
         "Block size: %dx%d, ref_time = %d \t simd_time = %d \t Scaling = %4.2f "
         "\n",
         bw, bh, total_time_ref, total_time_test,
         (static_cast<float>(total_time_ref) /
          static_cast<float>(total_time_test)));
   }

   // Horizontal and vertical gradient inputs.
   int16_t *gx_;
   int16_t *gy_;
   // Prediction-difference input.
   int16_t *pdiff_;
 };

 // CheckOutput calls RunTest(0) (output comparison); Speed calls RunTest(1)
 // (timing) and carries the DISABLED_ prefix, so it only runs when disabled
 // tests are requested.
 TEST_P(AV1CalcAffineAutocorrelationMatrixTest, CheckOutput) { RunTest(0); }
 TEST_P(AV1CalcAffineAutocorrelationMatrixTest, DISABLED_Speed) { RunTest(1); }

 // Parameter set with the C implementation as the function under test.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1CalcAffineAutocorrelationMatrixTest,
     BuildOptFlowHighbdParams(av1_calc_affine_autocorrelation_matrix_c));

 #if HAVE_AVX2
 // Parameter set with the AVX2 implementation as the function under test.
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1CalcAffineAutocorrelationMatrixTest,
     BuildOptFlowHighbdParams(av1_calc_affine_autocorrelation_matrix_avx2));
 #endif  // HAVE_AVX2

 #if AFFINE_AVERAGING_BITS > 0
 // Function-pointer type of the average-pooling kernels exercised below
 // (av1_avg_pooling_pdiff_gradients_c and its AVX2 variant). The buffers are
 // non-const, and the tests compare them after the call, so the kernel
 // operates on pdiff, gx and gy in place. The tests pass
 // n = min(bw, bh, 16) and compare the leading n x n region of each buffer.
 typedef void (*av1_avg_pooling_pdiff_gradients_fun)(
     int16_t *pdiff, const int pstride, int16_t *gx, int16_t *gy,
     const int gstride, const int bw, const int bh, const int n);

 // Compares the parameterized av1_avg_pooling_pdiff_gradients implementation
 // against av1_avg_pooling_pdiff_gradients_c. Both kernels work in place, so
 // the fixture keeps two identical copies of every input: set 1 is handed to
 // the reference and set 2 to the implementation under test.
 class AV1AvgPoolingPdiffGradientTest
     : public AV1OptFlowTest<av1_avg_pooling_pdiff_gradients_fun> {
  public:
   // Allocates 16-byte aligned bw x bh buffers for both input sets. Fatal
   // assertions cannot be used in a constructor, so the allocation results
   // are validated at the start of RunTest() instead.
   AV1AvgPoolingPdiffGradientTest() {
     const BlockSize &block = GetParam().Block();
     const int num_pels = block.Width() * block.Height();

     gx_avg1_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     gy_avg1_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     pdiff_avg1_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     gx_avg2_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     gy_avg2_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
     pdiff_avg2_ = static_cast<int16_t *>(
         aom_memalign(16, num_pels * sizeof(int16_t)));
   }

   ~AV1AvgPoolingPdiffGradientTest() {
     aom_free(gx_avg1_);
     aom_free(gy_avg1_);
     aom_free(pdiff_avg1_);
     aom_free(gx_avg2_);
     aom_free(gy_avg2_);
     aom_free(pdiff_avg2_);
   }

   // Test driver.
   //   is_speed == 0: output comparison on random inputs, then on extreme
   //     inputs, numIter times each.
   //   is_speed != 0: one timing run on random inputs.
   // Blocks with a dimension of 8 or less are skipped entirely.
   // Stops at the first fatal failure instead of repeating it for every
   // remaining iteration.
   void RunTest(const int is_speed) {
     ASSERT_NE(gx_avg1_, nullptr);
     ASSERT_NE(gy_avg1_, nullptr);
     ASSERT_NE(pdiff_avg1_, nullptr);
     ASSERT_NE(gx_avg2_, nullptr);
     ASSERT_NE(gy_avg2_, nullptr);
     ASSERT_NE(pdiff_avg2_, nullptr);

     const BlockSize &block = GetParam().Block();

     // AVX2 version only supports avg pooling from larger size to 16x16
     if (block.Width() <= 8 || block.Height() <= 8) return;

     const int bd = GetParam().BitDepth();
     // Block dimensions in units of (1 << MI_SIZE_LOG2) samples; larger
     // blocks get proportionally fewer iterations.
     const int bw_units = block.Width() >> MI_SIZE_LOG2;
     const int bh_units = block.Height() >> MI_SIZE_LOG2;
     const int numIter = is_speed ? 1 : 16384 / (bw_units * bh_units);
     const size_t plane_bytes =
         sizeof(int16_t) * block.Width() * block.Height();

     for (int count = 0; count < numIter; count++) {
       RandomInput16(gx_avg1_, GetParam(), 16);
       RandomInput16(gy_avg1_, GetParam(), 16);
       RandomInput16(pdiff_avg1_, GetParam(), bd + 1);
       // Give the implementation under test its own copy of the inputs.
       memcpy(gx_avg2_, gx_avg1_, plane_bytes);
       memcpy(gy_avg2_, gy_avg1_, plane_bytes);
       memcpy(pdiff_avg2_, pdiff_avg1_, plane_bytes);

       TestAvgPoolingPdiffGrad(pdiff_avg1_, gx_avg1_, gy_avg1_, pdiff_avg2_,
                               gx_avg2_, gy_avg2_, is_speed);
       // A fatal assertion inside the helper only returns from the helper;
       // propagate it so the loop does not keep failing.
       if (::testing::Test::HasFatalFailure()) return;
     }
     if (is_speed) return;

     // Extreme value test
     for (int count = 0; count < numIter; count++) {
       RandomInput16Extreme(gx_avg1_, GetParam(), 16);
       RandomInput16Extreme(gy_avg1_, GetParam(), 16);
       RandomInput16Extreme(pdiff_avg1_, GetParam(), bd + 1);
       memcpy(gx_avg2_, gx_avg1_, plane_bytes);
       memcpy(gy_avg2_, gy_avg1_, plane_bytes);
       memcpy(pdiff_avg2_, pdiff_avg1_, plane_bytes);

       TestAvgPoolingPdiffGrad(pdiff_avg1_, gx_avg1_, gy_avg1_, pdiff_avg2_,
                               gx_avg2_, gy_avg2_, 0);
       if (::testing::Test::HasFatalFailure()) return;
     }
   }

  private:
   // Dispatches one prepared pair of input sets to either the timing path or
   // the output-comparison path. All buffers use a row pitch of bw.
   void TestAvgPoolingPdiffGrad(int16_t *pdiff_avg1, int16_t *gx_avg1,
                                int16_t *gy_avg1, int16_t *pdiff_avg2,
                                int16_t *gx_avg2, int16_t *gy_avg2,
                                const int is_speed) {
     const BlockSize &block = GetParam().Block();
     const int bw = block.Width();
     const int bh = block.Height();
     const int pstride = bw;
     const int gstride = bw;

     av1_avg_pooling_pdiff_gradients_fun ref_func =
         av1_avg_pooling_pdiff_gradients_c;
     av1_avg_pooling_pdiff_gradients_fun test_func = GetParam().TestFunction();

     if (is_speed) {
       AvgPoolingPdiffGradSpeed(ref_func, test_func, pstride, gstride, bw, bh,
                                pdiff_avg1, gx_avg1, gy_avg1, pdiff_avg2,
                                gx_avg2, gy_avg2);
     } else {
       AvgPoolingPdiffGrad(ref_func, test_func, pstride, gstride, bw, bh,
                           pdiff_avg1, gx_avg1, gy_avg1, pdiff_avg2, gx_avg2,
                           gy_avg2);
     }
   }

   // Runs the reference on set 1 and the implementation under test on set 2,
   // then compares the leading n x n region (row pitch bw) of each buffer,
   // where n = min(bw, bh, 16).
   void AvgPoolingPdiffGrad(av1_avg_pooling_pdiff_gradients_fun ref_func,
                            av1_avg_pooling_pdiff_gradients_fun test_func,
                            int pstride, int gstride, int bw, int bh,
                            int16_t *pdiff_avg1, int16_t *gx_avg1,
                            int16_t *gy_avg1, int16_t *pdiff_avg2,
                            int16_t *gx_avg2, int16_t *gy_avg2) {
     const int n = AOMMIN(AOMMIN(bw, bh), 16);
     ref_func(pdiff_avg1, pstride, gx_avg1, gy_avg1, gstride, bw, bh, n);
     test_func(pdiff_avg2, pstride, gx_avg2, gy_avg2, gstride, bw, bh, n);
     AssertOutputBufferEq(pdiff_avg1, pdiff_avg2, n, n, bw);
     AssertOutputBufferEq(gx_avg1, gx_avg2, n, n, bw);
     AssertOutputBufferEq(gy_avg1, gy_avg2, n, n, bw);
   }

   // Times numIter back-to-back calls of each implementation and prints the
   // block size, both elapsed times and their ratio. Each kernel keeps
   // re-processing its own in-place buffers across iterations.
   void AvgPoolingPdiffGradSpeed(av1_avg_pooling_pdiff_gradients_fun ref_func,
                                 av1_avg_pooling_pdiff_gradients_fun test_func,
                                 int pstride, int gstride, int bw, int bh,
                                 int16_t *pdiff_avg1, int16_t *gx_avg1,
                                 int16_t *gy_avg1, int16_t *pdiff_avg2,
                                 int16_t *gx_avg2, int16_t *gy_avg2) {
     const int n = AOMMIN(AOMMIN(bw, bh), 16);

     const int numIter = 1000000;
     aom_usec_timer timer_ref;
     aom_usec_timer timer_test;

     aom_usec_timer_start(&timer_ref);
     for (int count = 0; count < numIter; count++) {
       ref_func(pdiff_avg1, pstride, gx_avg1, gy_avg1, gstride, bw, bh, n);
     }
     aom_usec_timer_mark(&timer_ref);

     aom_usec_timer_start(&timer_test);
     for (int count = 0; count < numIter; count++) {
       test_func(pdiff_avg2, pstride, gx_avg2, gy_avg2, gstride, bw, bh, n);
     }
     aom_usec_timer_mark(&timer_test);

     const int total_time_ref =
         static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
     const int total_time_test =
         static_cast<int>(aom_usec_timer_elapsed(&timer_test));

     printf(
         "Block size: %dx%d, C time = %d \t SIMD time = %d \t Scaling = %4.2f "
         "\n",
         bw, bh, total_time_ref, total_time_test,
         (static_cast<float>(total_time_ref) /
          static_cast<float>(total_time_test)));
   }

   // Input set 1: processed in place by the reference implementation.
   int16_t *gx_avg1_;
   int16_t *gy_avg1_;
   int16_t *pdiff_avg1_;
   // Input set 2: processed in place by the implementation under test.
   int16_t *gx_avg2_;
   int16_t *gy_avg2_;
   int16_t *pdiff_avg2_;
 };

 // CheckOutput calls RunTest(0) (output comparison); Speed calls RunTest(1)
 // (timing) and carries the DISABLED_ prefix, so it only runs when disabled
 // tests are requested.
 TEST_P(AV1AvgPoolingPdiffGradientTest, CheckOutput) { RunTest(0); }
 TEST_P(AV1AvgPoolingPdiffGradientTest, DISABLED_Speed) { RunTest(1); }

 // Parameter set with the C implementation as the function under test.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1AvgPoolingPdiffGradientTest,
     BuildOptFlowHighbdParams(av1_avg_pooling_pdiff_gradients_c));

 #if HAVE_AVX2
 // Parameter set with the AVX2 implementation as the function under test.
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1AvgPoolingPdiffGradientTest,
     BuildOptFlowHighbdParams(av1_avg_pooling_pdiff_gradients_avx2));
 #endif  // HAVE_AVX2
 #endif  // AFFINE_AVERAGING_BITS > 0
 #endif  // OPFL_COMBINE_INTERP_GRAD_LS
 #endif  // CONFIG_AFFINE_REFINEMENT
 }  // namespace

 #endif  // CONFIG_OPTFLOW_REFINEMENT