test/av1_convolve_test.cc - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <ostream>
 #include <set>
 #include <vector>
 #include "aom_ports/aom_timer.h"
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wuninitialized"
 #include "test/acm_random.h"
 #pragma GCC diagnostic pop
 #include "test/clear_system_state.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

 #if CONFIG_LR_IMPROVEMENTS
 #include "av1/common/restoration.h"
 #endif  // CONFIG_LR_IMPROVEMENTS

 namespace {

 // Redefine INTERP_FILTERS_ALL for this file: the filter loops in the tests
 // below iterate over [EIGHTTAP_REGULAR, INTERP_FILTERS_ALL), so this caps
 // them at 4.
 // TODO(any): Remove following INTERP_FILTERS_ALL define, so that 12-tap filter
 // is tested once 12-tap filter SIMD is done.
 #undef INTERP_FILTERS_ALL
 #define INTERP_FILTERS_ALL 4

 // All single reference convolve tests are parameterized on block size,
 // bit-depth, and function to test.
 //
 // Note that parameterizing on these variables (and not other parameters) is
 // a conscious decision - Jenkins needs some degree of parallelization to run
 // the tests within the time limit, but if the number of parameters increases
 // too much, the gtest framework does not handle it well (increased overhead per
 // test, huge amount of output to stdout, etc.).
 //
 // Also note that the test suites must be named with the architecture, e.g.,
 // C, C_X, AVX2_X, ... The test suite that runs on Jenkins sometimes runs tests
 // that cannot deal with intrinsics (e.g., the Valgrind tests on 32-bit x86
 // binaries) and will disable tests using a filter like
 // --gtest_filter=-:SSE4_1.*. If the test suites are not named this way, the
 // testing infrastructure will not selectively filter them properly.
 // Width/height pair describing one block shape under test. Instances are
 // ordered by width first and height second, which lets them be stored in
 // ordered containers such as std::set.
 class BlockSize {
  public:
   BlockSize(int w, int h) : width_(w), height_(h) {}

   int Width() const { return width_; }
   int Height() const { return height_; }

   // Lexicographic comparison on (width, height).
   bool operator<(const BlockSize &other) const {
     if (width_ != other.width_) return width_ < other.width_;
     return height_ < other.height_;
   }

   // Two sizes are equal only when both dimensions match.
   bool operator==(const BlockSize &other) const {
     return !(width_ != other.width_ || height_ != other.height_);
   }

  private:
   int width_;
   int height_;
 };

 // Block size / bit depth / test function used to parameterize the tests.
 template <typename T>
 class TestParam {
  public:
   TestParam(const BlockSize &block, int bd, T test_func)
       : block_(block), bd_(bd), test_func_(test_func) {}

   const BlockSize &Block() const { return block_; }
   int BitDepth() const { return bd_; }
   T TestFunction() const { return test_func_; }

   bool operator==(const TestParam &other) const {
     return Block() == other.Block() && BitDepth() == other.BitDepth() &&
            TestFunction() == other.TestFunction();
   }

  private:
   BlockSize block_;
   int bd_;
   T test_func_;
 };

 template <typename T>
 std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) {
   return os << "TestParam { width:" << test_arg.Block().Width()
             << " height:" << test_arg.Block().Height()
             << " bd:" << test_arg.BitDepth() << " }";
 }

 // Builds the list of every block width / height that needs to be tested, for
 // each of the given bit-depths. Both luma sizes and the smaller chroma sizes
 // are included. The test function is the same for all generated parameters.
 template <typename T>
 std::vector<TestParam<T>> GetTestParams(std::initializer_list<int> bit_depths,
                                         T test_func) {
   // The set drops duplicate dimensions and fixes the iteration order.
   std::set<BlockSize> unique_sizes;
   for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
     const BlockSize luma(block_size_wide[bsize], block_size_high[bsize]);
     unique_sizes.insert(luma);
     // A 4-pixel dimension also yields a half-size chroma block.
     if (luma.Width() == 4 || luma.Height() == 4) {
       unique_sizes.insert(BlockSize(luma.Width() / 2, luma.Height() / 2));
     }
   }
   std::vector<TestParam<T>> params;
   for (const BlockSize &size : unique_sizes) {
     for (const int depth : bit_depths) {
       params.push_back(TestParam<T>(size, depth, test_func));
     }
   }
   return params;
 }

 // Empty fixture for the TEST_F cases in this file that check the
 // test-parameter generators work as expected.
 class AV1ConvolveParametersTest : public ::testing::Test {};

 // High bit-depth parameter list: every block size from GetTestParams() at
 // bit depths 10 and 12.
 template <typename T>
 std::vector<TestParam<T>> GetHighbdTestParams(T test_func) {
   const std::initializer_list<int> highbd_depths = { 10, 12 };
   return GetTestParams(highbd_depths, test_func);
 }

 // Wraps GetHighbdTestParams() in a gtest parameter generator, for use with
 // INSTANTIATE_TEST_SUITE_P.
 template <typename T>
 ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdParams(
     T test_func) {
   const std::vector<TestParam<T>> params = GetHighbdTestParams(test_func);
   return ::testing::ValuesIn(params);
 }

 // Checks the size of the high bit-depth parameter list, that every entry
 // carries the requested function, and that bit depths 10 and 12 appear
 // equally often.
 TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
   const auto params = GetHighbdTestParams(av1_highbd_convolve_x_sr_c);
 #if CONFIG_FLEX_PARTITION
   ASSERT_EQ(80U, params.size());
 #else
   ASSERT_EQ(60U, params.size());
 #endif  // CONFIG_FLEX_PARTITION
   int count_bd10 = 0;
   int count_bd12 = 0;
   for (const auto &param : params) {
     const int bd = param.BitDepth();
     ASSERT_TRUE(bd == 10 || bd == 12);
     const bool same_fn = param.TestFunction() == av1_highbd_convolve_x_sr_c;
     ASSERT_TRUE(same_fn);
     if (bd == 10) {
       ++count_bd10;
     } else {
       ++count_bd12;
     }
   }
   ASSERT_EQ(count_bd10, count_bd12);
 }

 // AV1ConvolveTest is the base fixture that all convolve tests should derive
 // from. It owns the storage and helpers for generating randomized input
 // buffers (both low bit-depth and high bit-depth), re-seeds the random
 // generator in SetUp() and clears system state in TearDown(). Implementors
 // can get the bit-depth / block-size / test function by calling GetParam().
 template <typename T>
 class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
  public:
   virtual ~AV1ConvolveTest() { TearDown(); }

   // Resets the random generator to the deterministic seed.
   virtual void SetUp() override {
     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   }

   virtual void TearDown() override { libaom_test::ClearSystemState(); }

   // Number of extra rows / columns randomized around a block. The pointers
   // returned by the *RandomInput* helpers sit kInputPadding / 2 rows and
   // columns inside the randomized area, so they are safe to use with an
   // 8-tap filter. The stride can range from width to
   // (width + kInputPadding).
   static constexpr int kInputPadding = 8;

   // Some of the intrinsics perform writes in 32 byte chunks. Moreover, some
   // of the intrinsics assume that the stride is also a multiple of 32.
   // To satisfy these constraints and also remain simple, output buffer strides
   // are assumed MAX_SB_SIZE.
   static constexpr int kOutputStride = MAX_SB_SIZE;

   // Randomizes the first 8-bit input buffer and returns a pointer into it.
   // The test param must be passed in explicitly -- the gtest framework does
   // not support calling GetParam() within a templatized class. For a given
   // param this always returns the same pointer -- if two inputs are needed,
   // also use SecondRandomInput8.
   const uint8_t *FirstRandomInput8(const TestParam<T> &param) {
     return RandomInput8(input8_1_, param);
   }

   // Same as FirstRandomInput8, but backed by the second 8-bit buffer.
   const uint8_t *SecondRandomInput8(const TestParam<T> &param) {
     return RandomInput8(input8_2_, param);
   }

   // Randomizes the first 16-bit input buffer and returns a pointer into it.
   // Note that the randomized values are capped by bit-depth.
   const uint16_t *FirstRandomInput16(const TestParam<T> &param) {
     return RandomInput16(input16_1_, param);
   }

   // Same as FirstRandomInput16, but backed by the second 16-bit buffer.
   const uint16_t *SecondRandomInput16(const TestParam<T> &param) {
     return RandomInput16(input16_2_, param);
   }

 #if CONFIG_LR_IMPROVEMENTS
   // Like FirstRandomInput16, but every sample is either 0 or the largest
   // value representable at the param's bit-depth.
   const uint16_t *FirstRandomInput16Extreme(const TestParam<T> &param) {
     return RandomInput16Extreme(input16_1_, param);
   }
 #endif  // CONFIG_LR_IMPROVEMENTS

   // Check that two 8-bit output buffers are identical. Both buffers are
   // walked with a stride of kOutputStride.
   void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
                             int height) {
     ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
     for (int j = 0; j < height; ++j) {
       if (memcmp(p1, p2, sizeof(*p1) * width) != 0) {
         // The row differs somewhere: compare pixel by pixel so that the
         // failure message names the offending coordinate.
         for (int i = 0; i < width; ++i) {
           ASSERT_EQ(p1[i], p2[i])
               << width << "x" << height << " Pixel mismatch at (" << i << ", "
               << j << ")";
         }
       } else {
         p1 += kOutputStride;
         p2 += kOutputStride;
       }
     }
   }

   // Check that two 16-bit output buffers are identical. Both buffers are
   // walked with a stride of kOutputStride.
   void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
                             int height) {
     ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
     for (int j = 0; j < height; ++j) {
       if (memcmp(p1, p2, sizeof(*p1) * width) != 0) {
         // The row differs somewhere: compare pixel by pixel so that the
         // failure message names the offending coordinate.
         for (int i = 0; i < width; ++i) {
           ASSERT_EQ(p1[i], p2[i])
               << width << "x" << height << " Pixel mismatch at (" << i << ", "
               << j << ")";
         }
       } else {
         p1 += kOutputStride;
         p2 += kOutputStride;
       }
     }
   }

  private:
   // Fills (width + kInputPadding) * (height + kInputPadding) bytes of |p|
   // with random values and returns a pointer offset by half the padding in
   // both dimensions. Expects an 8-bit param.
   const uint8_t *RandomInput8(uint8_t *p, const TestParam<T> &param) {
     const BlockSize &block = param.Block();
     EXPECT_EQ(8, param.BitDepth());
     EXPECT_GE(MAX_SB_SIZE, block.Width());
     EXPECT_GE(MAX_SB_SIZE, block.Height());
     const int padded_width = block.Width() + kInputPadding;
     const int padded_height = block.Height() + kInputPadding;
     Randomize(p, padded_width * padded_height);
     const int half_padding = kInputPadding / 2;
     return p + half_padding * padded_width + half_padding;
   }

   // Writes |size| random bytes to |p|.
   void Randomize(uint8_t *p, int size) {
     for (int idx = 0; idx < size; ++idx) {
       p[idx] = rnd_.Rand8();
     }
   }

   // 16-bit counterpart of RandomInput8; samples are masked to the param's
   // bit-depth.
   const uint16_t *RandomInput16(uint16_t *p, const TestParam<T> &param) {
     const BlockSize &block = param.Block();
     const int bit_depth = param.BitDepth();
     // Check that this is only called with high bit-depths.
     EXPECT_TRUE(bit_depth == 10 || bit_depth == 12);
     EXPECT_GE(MAX_SB_SIZE, block.Width());
     EXPECT_GE(MAX_SB_SIZE, block.Height());
     const int padded_width = block.Width() + kInputPadding;
     const int padded_height = block.Height() + kInputPadding;
     Randomize(p, padded_width * padded_height, bit_depth);
     const int half_padding = kInputPadding / 2;
     return p + half_padding * padded_width + half_padding;
   }

   // Writes |size| random samples to |p|, keeping only the low |bit_depth|
   // bits of each.
   void Randomize(uint16_t *p, int size, int bit_depth) {
     for (int idx = 0; idx < size; ++idx) {
       p[idx] = rnd_.Rand16() & ((1 << bit_depth) - 1);
     }
   }

 #if CONFIG_LR_IMPROVEMENTS
   // Same layout as RandomInput16, but filled by RandomizeExtreme().
   const uint16_t *RandomInput16Extreme(uint16_t *p, const TestParam<T> &param) {
     const BlockSize &block = param.Block();
     const int bit_depth = param.BitDepth();
     // Check that this is only called with high bit-depths.
     EXPECT_TRUE(bit_depth == 10 || bit_depth == 12);
     EXPECT_GE(MAX_SB_SIZE, block.Width());
     EXPECT_GE(MAX_SB_SIZE, block.Height());
     const int padded_width = block.Width() + kInputPadding;
     const int padded_height = block.Height() + kInputPadding;
     RandomizeExtreme(p, padded_width * padded_height, bit_depth);
     const int half_padding = kInputPadding / 2;
     return p + half_padding * padded_width + half_padding;
   }

   // Sets each of the |size| samples to either 0 or (1 << max_bit_range) - 1,
   // chosen by RandBool().
   void RandomizeExtreme(uint16_t *p, int size, int max_bit_range) {
     EXPECT_GE(12, max_bit_range);
     const int max_val = (1 << max_bit_range) - 1;
     for (int idx = 0; idx < size; ++idx) {
       if (RandBool()) {
         p[idx] = static_cast<uint16_t>(max_val);
       } else {
         p[idx] = static_cast<uint16_t>(0);
       }
     }
   }

   // Returns 0 or 1, taken from bit 7 of a random byte.
   int RandBool() {
     const uint32_t random_byte = rnd_.Rand8();
     // There's a bit more entropy in the upper bits of this implementation.
     return (random_byte & 0x80) != 0;
   }
 #endif  // CONFIG_LR_IMPROVEMENTS

   static constexpr int kInputStride = MAX_SB_SIZE + kInputPadding;

   libaom_test::ACMRandom rnd_;
   // Statically allocate all the memory that is needed for the tests. Note
   // that we cannot allocate output memory here. It must use DECLARE_ALIGNED,
   // which is a C99 feature and interacts badly with C++ member variables.
   uint8_t input8_1_[kInputStride * kInputStride];
   uint8_t input8_2_[kInputStride * kInputStride];
   uint16_t input16_1_[kInputStride * kInputStride];
   uint16_t input16_2_[kInputStride * kInputStride];
 };

 /////////////////////////////////////////////////////////
 // Single reference convolve-x functions (high bit-depth)
 /////////////////////////////////////////////////////////
 // Signature of the high bit-depth convolve functions exercised by the
 // single-reference convolve-x test below.
 using highbd_convolve_x_func = void (*)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
     ConvolveParams *conv_params, int bd);

 // Compares the function under test against av1_highbd_convolve_x_sr_c for
 // every sub-pixel position and filter.
 class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> {
  public:
   // Runs TestConvolve() for sub-pixel offsets 0..15 and each filter in
   // [EIGHTTAP_REGULAR, INTERP_FILTERS_ALL).
   void RunTest() {
     for (int sub_x = 0; sub_x < 16; ++sub_x) {
       int filter = EIGHTTAP_REGULAR;
       while (filter < INTERP_FILTERS_ALL) {
         TestConvolve(sub_x, static_cast<InterpFilter>(filter));
         ++filter;
       }
     }
   }

  private:
   // Runs the reference and the tested function on the same random input and
   // requires identical output.
   void TestConvolve(const int sub_x, const InterpFilter filter) {
     const TestParam<highbd_convolve_x_func> &param = GetParam();
     const int width = param.Block().Width();
     const int height = param.Block().Height();
     const int bit_depth = param.BitDepth();
     // The horizontal filter is looked up from the block width.
     const InterpFilterParams *filter_params_x =
         av1_get_interp_filter_params_with_block_size(filter, width);
     const uint16_t *input = FirstRandomInput16(param);

     // Reference output from the C implementation.
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     ConvolveParams ref_params =
         get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
     av1_highbd_convolve_x_sr_c(input, width, reference, kOutputStride, width,
                                height, filter_params_x, sub_x, &ref_params,
                                bit_depth);

     // Output from the function under test, with its own ConvolveParams.
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     ConvolveParams test_params =
         get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
     const highbd_convolve_x_func test_func = param.TestFunction();
     test_func(input, width, test, kOutputStride, width, height,
               filter_params_x, sub_x, &test_params, bit_depth);

     AssertOutputBufferEq(reference, test, width, height);
   }
 };

 // Runs the comparison once per generated (block size, bit depth) parameter.
 TEST_P(AV1ConvolveXHighbdTest, RunTest) { RunTest(); }

 // The C instantiation checks the reference function against itself.
 INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_x_sr_c));

 // The SIMD instantiations are compiled only when the matching HAVE_* flag is
 // set.
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveXHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_x_sr_ssse3));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
 #endif

 /////////////////////////////////////////////////////////
 // Single reference convolve-y functions (high bit-depth)
 /////////////////////////////////////////////////////////
 // Signature of the high bit-depth convolve functions exercised by the
 // single-reference convolve-y test below.
 using highbd_convolve_y_func = void (*)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
     int bd);

 // Compares the function under test against av1_highbd_convolve_y_sr_c for
 // every sub-pixel position and filter.
 class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> {
  public:
   // Runs TestConvolve() for sub-pixel offsets 0..15 and each filter in
   // [EIGHTTAP_REGULAR, INTERP_FILTERS_ALL).
   void RunTest() {
     for (int sub_y = 0; sub_y < 16; ++sub_y) {
       int filter = EIGHTTAP_REGULAR;
       while (filter < INTERP_FILTERS_ALL) {
         TestConvolve(sub_y, static_cast<InterpFilter>(filter));
         ++filter;
       }
     }
   }

  private:
   // Runs the reference and the tested function on the same random input and
   // requires identical output.
   void TestConvolve(const int sub_y, const InterpFilter filter) {
     const TestParam<highbd_convolve_y_func> &param = GetParam();
     const int width = param.Block().Width();
     const int height = param.Block().Height();
     const int bit_depth = param.BitDepth();
     // The vertical filter is looked up from the block height.
     const InterpFilterParams *filter_params_y =
         av1_get_interp_filter_params_with_block_size(filter, height);
     const uint16_t *input = FirstRandomInput16(param);

     // Reference output from the C implementation.
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     av1_highbd_convolve_y_sr_c(input, width, reference, kOutputStride, width,
                                height, filter_params_y, sub_y, bit_depth);

     // Output from the function under test.
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     const highbd_convolve_y_func test_func = param.TestFunction();
     test_func(input, width, test, kOutputStride, width, height,
               filter_params_y, sub_y, bit_depth);

     AssertOutputBufferEq(reference, test, width, height);
   }
 };

 // Runs the comparison once per generated (block size, bit depth) parameter.
 TEST_P(AV1ConvolveYHighbdTest, RunTest) { RunTest(); }

 // The C instantiation checks the reference function against itself.
 INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_y_sr_c));

 // The SIMD instantiations are compiled only when the matching HAVE_* flag is
 // set.
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveYHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_y_sr_ssse3));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
 #endif

 ///////////////////////////////////////////////////////////////
 // Single reference convolve-copy functions (high bit-depth)
 ///////////////////////////////////////////////////////////////
 // Signature of the high bit-depth convolve-copy functions exercised by the
 // copy test below.
 using highbd_convolve_copy_func = void (*)(const uint16_t *src,
                                            ptrdiff_t src_stride, uint16_t *dst,
                                            ptrdiff_t dst_stride, int w, int h);

 // Compares the function under test against aom_highbd_convolve_copy_c on one
 // randomized input block.
 class AV1ConvolveCopyHighbdTest
     : public AV1ConvolveTest<highbd_convolve_copy_func> {
  public:
   void RunTest() {
     const TestParam<highbd_convolve_copy_func> &param = GetParam();
     const int width = param.Block().Width();
     const int height = param.Block().Height();
     const uint16_t *input = FirstRandomInput16(param);

     // Reference output from the C implementation.
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     aom_highbd_convolve_copy_c(input, width, reference, kOutputStride, width,
                                height);

     // Output from the function under test.
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     const highbd_convolve_copy_func test_func = param.TestFunction();
     test_func(input, width, test, kOutputStride, width, height);

     AssertOutputBufferEq(reference, test, width, height);
   }
 };

 // Runs the comparison once per generated (block size, bit depth) parameter.
 TEST_P(AV1ConvolveCopyHighbdTest, RunTest) { RunTest(); }

 // The C instantiation checks the reference function against itself.
 INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyHighbdTest,
                          BuildHighbdParams(aom_highbd_convolve_copy_c));

 // The SIMD instantiations are compiled only when the matching HAVE_* flag is
 // set.
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyHighbdTest,
                          BuildHighbdParams(aom_highbd_convolve_copy_sse2));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
                          BuildHighbdParams(aom_highbd_convolve_copy_avx2));
 #endif

 //////////////////////////////////////////////////////////
 // Single reference convolve-2d functions (high bit-depth)
 //////////////////////////////////////////////////////////

 // Signature of the high bit-depth convolve functions exercised by the
 // single-reference convolve-2d test below.
 using highbd_convolve_2d_func = void (*)(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     int h, const InterpFilterParams *filter_params_x,
     const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     const int subpel_y_qn, ConvolveParams *conv_params, int bd);

 // Compares the function under test against av1_highbd_convolve_2d_sr_c for
 // every pair of sub-pixel positions and every horizontal / vertical filter
 // pair.
 class AV1Convolve2DHighbdTest
     : public AV1ConvolveTest<highbd_convolve_2d_func> {
  public:
   // Sub-pixel offsets run over 0..15 in both directions; filters run over
   // [EIGHTTAP_REGULAR, INTERP_FILTERS_ALL) in both directions.
   void RunTest() {
     for (int sub_x = 0; sub_x < 16; ++sub_x) {
       for (int sub_y = 0; sub_y < 16; ++sub_y) {
         for (int h = EIGHTTAP_REGULAR; h < INTERP_FILTERS_ALL; ++h) {
           const InterpFilter h_f = static_cast<InterpFilter>(h);
           for (int v = EIGHTTAP_REGULAR; v < INTERP_FILTERS_ALL; ++v) {
             const InterpFilter v_f = static_cast<InterpFilter>(v);
             TestConvolve(h_f, v_f, sub_x, sub_y);
           }
         }
       }
     }
   }

  private:
   // Runs the reference and the tested function on the same random input and
   // requires identical output.
   void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
                     const int sub_x, const int sub_y) {
     const TestParam<highbd_convolve_2d_func> &param = GetParam();
     const int width = param.Block().Width();
     const int height = param.Block().Height();
     const int bit_depth = param.BitDepth();
     // Horizontal filter from the block width, vertical from the height.
     const InterpFilterParams *filter_params_x =
         av1_get_interp_filter_params_with_block_size(h_f, width);
     const InterpFilterParams *filter_params_y =
         av1_get_interp_filter_params_with_block_size(v_f, height);
     const uint16_t *input = FirstRandomInput16(param);

     // Reference output from the C implementation.
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     ConvolveParams ref_params =
         get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
     av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
                                 height, filter_params_x, filter_params_y, sub_x,
                                 sub_y, &ref_params, bit_depth);

     // Output from the function under test, with its own ConvolveParams.
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     ConvolveParams test_params =
         get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
     const highbd_convolve_2d_func test_func = param.TestFunction();
     test_func(input, width, test, kOutputStride, width, height,
               filter_params_x, filter_params_y, sub_x, sub_y, &test_params,
               bit_depth);

     AssertOutputBufferEq(reference, test, width, height);
   }
 };

 // Runs the comparison once per generated (block size, bit depth) parameter.
 TEST_P(AV1Convolve2DHighbdTest, RunTest) { RunTest(); }

 // The C instantiation checks the reference function against itself.
 INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_2d_sr_c));

 // The SIMD instantiations are compiled only when the matching HAVE_* flag is
 // set.
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_2d_sr_ssse3));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DHighbdTest,
                          BuildHighbdParams(av1_highbd_convolve_2d_sr_avx2));
 #endif

 //////////////////////////
 // Compound Convolve Tests
 //////////////////////////

 // The compound functions do not work for chroma block sizes, so this builds
 // test parameters from the luma block sizes only. The result is grouped by
 // bit depth: every size for the first depth, then every size for the next.
 template <typename T>
 std::vector<TestParam<T>> GetLumaTestParams(
     std::initializer_list<int> bit_depths, T test_func) {
   std::set<BlockSize> luma_sizes;
   for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
     const BlockSize luma(block_size_wide[bsize], block_size_high[bsize]);
     luma_sizes.insert(luma);
   }
   std::vector<TestParam<T>> params;
   for (const int depth : bit_depths) {
     for (const BlockSize &size : luma_sizes) {
       params.push_back(TestParam<T>(size, depth, test_func));
     }
   }
   return params;
 }

 // High bit-depth luma parameter list: every size from GetLumaTestParams() at
 // bit depths 10 and 12.
 template <typename T>
 std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) {
   const std::initializer_list<int> highbd_depths = { 10, 12 };
   return GetLumaTestParams(highbd_depths, test_func);
 }

 // Checks the size of the high bit-depth luma parameter list, that every entry
 // carries the requested function, and that bit depths 10 and 12 appear
 // equally often.
 TEST_F(AV1ConvolveParametersTest, GetHighbdLumaTestParams) {
   const auto params =
       GetHighbdLumaTestParams(av1_highbd_dist_wtd_convolve_x_c);
   ASSERT_EQ(static_cast<size_t>(BLOCK_SIZES_ALL * 2), params.size());
   int count_bd10 = 0;
   int count_bd12 = 0;
   for (const auto &param : params) {
     const int bd = param.BitDepth();
     ASSERT_TRUE(10 == bd || 12 == bd);
     const bool same_fn =
         param.TestFunction() == av1_highbd_dist_wtd_convolve_x_c;
     ASSERT_TRUE(same_fn);
     if (bd == 10) {
       ++count_bd10;
     } else {
       ++count_bd12;
     }
   }
   ASSERT_EQ(count_bd10, count_bd12);
 }

 // Wraps GetHighbdLumaTestParams() in a gtest parameter generator, for use
 // with INSTANTIATE_TEST_SUITE_P.
 template <typename T>
 ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdLumaParams(
     T test_func) {
   const std::vector<TestParam<T>> params = GetHighbdLumaTestParams(test_func);
   return ::testing::ValuesIn(params);
 }

 // Compound cases also need to test different frame offsets and weightings.
 // CompoundParam holds one forward / backward offset pair.
 class CompoundParam {
  public:
   CompoundParam(int fwd_offset, int bck_offset)
       : fwd_offset_(fwd_offset), bck_offset_(bck_offset) {}

   // True unless both offsets equal 1 << (DIST_PRECISION_BITS - 1).
   bool UseWtdCompAvg() const {
     const int equal_weight = 1 << (DIST_PRECISION_BITS - 1);
     const bool both_equal_weight =
         bck_offset_ == equal_weight && fwd_offset_ == equal_weight;
     return !both_equal_weight;
   }
   int FwdOffset() const { return fwd_offset_; }
   int BckOffset() const { return bck_offset_; }

  private:
   int fwd_offset_;
   int bck_offset_;
 };

 std::vector<CompoundParam> GetCompoundParams() {
   std::vector<CompoundParam> result;
   result.push_back(CompoundParam(1 << (DIST_PRECISION_BITS - 1),
                                  1 << (DIST_PRECISION_BITS - 1)));
   for (int k = 0; k < 2; ++k) {
     for (int l = 0; l < 4; ++l) {
       result.push_back(CompoundParam(quant_dist_lookup_table[l][k],
                                      quant_dist_lookup_table[l][1 - k]));
     }
   }
   return result;
 }

 // Checks that nine weightings are generated and that only the leading
 // equal-weight entry reports no weighted compound averaging.
 TEST_F(AV1ConvolveParametersTest, GetCompoundParams) {
   const auto params = GetCompoundParams();
   ASSERT_EQ(9U, params.size());
   ASSERT_FALSE(params.front().UseWtdCompAvg());
   for (auto it = params.begin() + 1; it != params.end(); ++it) {
     ASSERT_TRUE(it->UseWtdCompAvg());
   }
 }

 /////////////////////////////////////////////////
 // Compound convolve-x functions (high bit-depth)
 /////////////////////////////////////////////////
 // Builds the ConvolveParams for one pass of a compound convolve.
 //   do_average: forwarded to get_conv_params_no_round().
 //   conv_buf:   forwarded to get_conv_params_no_round() as its buffer
 //               argument.
 //   width:      forwarded to get_conv_params_no_round() right after
 //               |conv_buf|; the callers in this file pass kOutputStride.
 //   bit_depth:  forwarded to get_conv_params_no_round().
 //   compound:   supplies the forward / backward offsets stored in the result.
 // The fifth argument of get_conv_params_no_round() is always 1 here.
 ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf,
                                  int width, int bit_depth,
                                  const CompoundParam &compound) {
   ConvolveParams conv_params =
       get_conv_params_no_round(do_average, 0, conv_buf, width, 1, bit_depth);
   conv_params.fwd_offset = compound.FwdOffset();
   conv_params.bck_offset = compound.BckOffset();
   return conv_params;
 }

 // Compares the compound function under test against ReferenceFunc() for
 // every sub-pixel position, filter and compound weighting. Subclasses can
 // override ReferenceFunc() and FilterParams() to reuse the same logic.
 class AV1ConvolveXHighbdCompoundTest
     : public AV1ConvolveTest<highbd_convolve_x_func> {
  public:
   // Sub-pixel offsets run over 0..15, filters over
   // [EIGHTTAP_REGULAR, INTERP_FILTERS_ALL), weightings over
   // GetCompoundParams().
   void RunTest() {
     const std::vector<CompoundParam> compound_params = GetCompoundParams();
     for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
       for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
         const InterpFilter filter = static_cast<InterpFilter>(f);
         for (const CompoundParam &compound : compound_params) {
           TestConvolve(sub_pix, filter, compound);
         }
       }
     }
   }

  protected:
   // Filter lookup hook; this class looks the filter up from the block width.
   virtual const InterpFilterParams *FilterParams(InterpFilter f,
                                                  const BlockSize &block) const {
     const int width = block.Width();
     return av1_get_interp_filter_params_with_block_size(f, width);
   }

   // Reference implementation that the function under test is compared to.
   virtual highbd_convolve_x_func ReferenceFunc() const {
     return av1_highbd_dist_wtd_convolve_x_c;
   }

  private:
   // Runs the reference and the tested function on the same two random inputs
   // and requires both the convolve buffer and the output to match.
   void TestConvolve(const int sub_pix, const InterpFilter filter,
                     const CompoundParam &compound) {
     const TestParam<highbd_convolve_x_func> &param = GetParam();
     const int width = param.Block().Width();
     const int height = param.Block().Height();
     const uint16_t *input1 = FirstRandomInput16(param);
     const uint16_t *input2 = SecondRandomInput16(param);

     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
     const highbd_convolve_x_func ref_func = ReferenceFunc();
     Convolve(ref_func, input1, input2, reference, reference_conv_buf,
              compound, sub_pix, filter);

     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
     const highbd_convolve_x_func test_func = param.TestFunction();
     Convolve(test_func, input1, input2, test, test_conv_buf, compound,
              sub_pix, filter);

     AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
     AssertOutputBufferEq(reference, test, width, height);
   }

   // Calls |test_func| twice: on |src1| with do_average == 0, then on |src2|
   // with do_average == 1. Both calls share |dst| and |conv_buf|.
   void Convolve(highbd_convolve_x_func test_func, const uint16_t *src1,
                 const uint16_t *src2, uint16_t *dst, CONV_BUF_TYPE *conv_buf,
                 const CompoundParam &compound, const int sub_pix,
                 const InterpFilter filter) {
     const BlockSize &block = GetParam().Block();
     const int width = block.Width();
     const int height = block.Height();
     const int bit_depth = GetParam().BitDepth();
     const InterpFilterParams *filter_params = FilterParams(filter, block);

     ConvolveParams first_pass =
         GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
     test_func(src1, width, dst, kOutputStride, width, height, filter_params,
               sub_pix, &first_pass, bit_depth);

     ConvolveParams second_pass =
         GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
     test_func(src2, width, dst, kOutputStride, width, height, filter_params,
               sub_pix, &second_pass, bit_depth);
   }
 };

 // Runs the comparison once per generated luma (block size, bit depth)
 // parameter.
 TEST_P(AV1ConvolveXHighbdCompoundTest, RunTest) { RunTest(); }

 // The C instantiation checks the reference function against itself.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1ConvolveXHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_c));

 // The SIMD instantiations are compiled only when the matching HAVE_* flag is
 // set.
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1ConvolveXHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sse4_1));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ConvolveXHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2));
 #endif

 /////////////////////////////////////////////////
 // Compound convolve-y functions (high bit-depth)
 /////////////////////////////////////////////////

 // Again, the X and Y convolve functions have the same type signature and logic.
 // Only the reference function and the dimension used for the filter lookup
 // are overridden here.
 class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest {
   // The filter is looked up from the block height.
   virtual const InterpFilterParams *FilterParams(
       InterpFilter f, const BlockSize &block) const override {
     const int height = block.Height();
     return av1_get_interp_filter_params_with_block_size(f, height);
   }
   virtual highbd_convolve_x_func ReferenceFunc() const override {
     return av1_highbd_dist_wtd_convolve_y_c;
   }
 };

 // Runs the comparison once per generated luma (block size, bit depth)
 // parameter.
 TEST_P(AV1ConvolveYHighbdCompoundTest, RunTest) { RunTest(); }

 // The C instantiation checks the reference function against itself.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1ConvolveYHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_c));

 // The SIMD instantiations are compiled only when the matching HAVE_* flag is
 // set.
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1ConvolveYHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sse4_1));
 #endif

 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ConvolveYHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2));
 #endif

 ///////////////////////////////////////////////////////
 // Compound convolve-2d-copy functions (high bit-depth)
 ///////////////////////////////////////////////////////
 // Signature of the high bit-depth compound convolve-2d-copy functions
 // exercised by the test below.
 using highbd_compound_conv_2d_copy_func = void (*)(const uint16_t *src,
                                                    int src_stride, uint16_t *dst,
                                                    int dst_stride, int w, int h,
                                                    ConvolveParams *conv_params,
                                                    int bd);

 // Correctness and speed tests for the high bit-depth compound 2D-copy
 // convolve functions. The function under test is checked against
 // av1_highbd_dist_wtd_convolve_2d_copy_c.
 class AV1Convolve2DCopyHighbdCompoundTest
     : public AV1ConvolveTest<highbd_compound_conv_2d_copy_func> {
  public:
   // Runs the bit-exactness check once for every compound parameter set.
   void RunTest() {
     for (const auto &param_set : GetCompoundParams()) {
       TestConvolve(param_set);
     }
   }

   // Runs the timing comparison once for every compound parameter set.
   void SpeedTest() {
     for (const auto &param_set : GetCompoundParams()) {
       SpeedTestConvolve(param_set);
     }
   }

  private:
   // Times the C reference and the function under test over the same number
   // of call pairs and prints the ratio of the two elapsed times.
   void SpeedTestConvolve(const CompoundParam &compound) {
     const BlockSize &blk = GetParam().Block();
     const int w = blk.Width();
     const int h = blk.Height();
     const int bd = GetParam().BitDepth();
     const int num_runs = 100000;

     const uint16_t *src = FirstRandomInput16(GetParam());
     DECLARE_ALIGNED(32, uint16_t, out_buf[MAX_SB_SQUARE]);
     highbd_compound_conv_2d_copy_func func_under_test =
         GetParam().TestFunction();

     // Every timed iteration issues one call with each of these two parameter
     // sets, which differ only in the first GetConvolveParams() argument.
     ConvolveParams first_params =
         GetConvolveParams(0, out_buf, kOutputStride, bd, compound);
     ConvolveParams second_params =
         GetConvolveParams(1, out_buf, kOutputStride, bd, compound);

     // C reference timing.
     aom_usec_timer ref_timer;
     aom_usec_timer_start(&ref_timer);
     for (int run = 0; run < num_runs; run++) {
       av1_highbd_dist_wtd_convolve_2d_copy_c(src, w, out_buf, kOutputStride, w,
                                              h, &first_params, bd);
       av1_highbd_dist_wtd_convolve_2d_copy_c(src, w, out_buf, kOutputStride, w,
                                              h, &second_params, bd);
     }
     aom_usec_timer_mark(&ref_timer);
     const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));

     // Timing of the function under test.
     aom_usec_timer test_timer;
     aom_usec_timer_start(&test_timer);
     for (int run = 0; run < num_runs; run++) {
       func_under_test(src, w, out_buf, kOutputStride, w, h, &first_params, bd);
       func_under_test(src, w, out_buf, kOutputStride, w, h, &second_params,
                       bd);
     }
     aom_usec_timer_mark(&test_timer);
     const int test_usec =
         static_cast<int>(aom_usec_timer_elapsed(&test_timer));

     printf("%d x %d block: bd: %d, Scaling = %.2f\n", w, h, bd,
            static_cast<double>(ref_usec) / test_usec);
   }

   // Runs the C reference and the function under test on the same pair of
   // random inputs and requires both the intermediate buffer and the final
   // output to match over the block area.
   void TestConvolve(const CompoundParam &compound) {
     const BlockSize &blk = GetParam().Block();
     const int w = blk.Width();
     const int h = blk.Height();

     const uint16_t *src_a = FirstRandomInput16(GetParam());
     const uint16_t *src_b = SecondRandomInput16(GetParam());

     DECLARE_ALIGNED(32, uint16_t, ref_out[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, CONV_BUF_TYPE, ref_conv_buf[MAX_SB_SQUARE]);
     Convolve(av1_highbd_dist_wtd_convolve_2d_copy_c, src_a, src_b, ref_out,
              ref_conv_buf, compound);

     DECLARE_ALIGNED(32, uint16_t, test_out[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_out[MAX_SB_SQUARE]);
     Convolve(GetParam().TestFunction(), src_a, src_b, test_out, test_conv_out,
              compound);

     AssertOutputBufferEq(ref_conv_buf, test_conv_out, w, h);
     AssertOutputBufferEq(ref_out, test_out, w, h);
   }

   // Calls test_func twice: first on src1 with GetConvolveParams(0, ...), then
   // on src2 with GetConvolveParams(1, ...). Both calls share dst and conv_buf.
   void Convolve(highbd_compound_conv_2d_copy_func test_func,
                 const uint16_t *src1, const uint16_t *src2, uint16_t *dst,
                 uint16_t *conv_buf, const CompoundParam &compound) {
     const BlockSize &blk = GetParam().Block();
     const int w = blk.Width();
     const int h = blk.Height();
     const int bd = GetParam().BitDepth();

     const uint16_t *const pass_src[2] = { src1, src2 };
     for (int pass = 0; pass < 2; ++pass) {
       ConvolveParams pass_params =
           GetConvolveParams(pass, conv_buf, kOutputStride, bd, compound);
       test_func(pass_src[pass], w, dst, kOutputStride, w, h, &pass_params, bd);
     }
   }
 };

 // Bit-exactness test against the C reference.
 TEST_P(AV1Convolve2DCopyHighbdCompoundTest, RunTest) { RunTest(); }
 // Timing comparison. The DISABLED_ prefix keeps googletest from running it
 // unless disabled tests are explicitly requested.
 TEST_P(AV1Convolve2DCopyHighbdCompoundTest, DISABLED_SpeedTest) { SpeedTest(); }

 // The C function is always instantiated; here it is compared with itself.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1Convolve2DCopyHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_c));

 // SSE4.1 implementation, only when the build provides it.
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1Convolve2DCopyHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_sse4_1));
 #endif

 // AVX2 implementation, only when the build provides it.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1Convolve2DCopyHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2));
 #endif

 //////////////////////////////////////////////////
 // Compound convolve-2d functions (high bit-depth)
 //////////////////////////////////////////////////

 // Bit-exactness test for the high bit-depth compound 2D convolve functions.
 // The function under test is compared with av1_highbd_dist_wtd_convolve_2d_c
 // for every horizontal/vertical filter pair in
 // [EIGHTTAP_REGULAR, INTERP_FILTERS_ALL), every sub-pel offset pair in
 // [0, 16) x [0, 16), and every compound parameter set.
 class AV1Convolve2DHighbdCompoundTest
     : public AV1ConvolveTest<highbd_convolve_2d_func> {
  public:
   void RunTest() {
     const auto compound_sets = GetCompoundParams();
     for (int hf = EIGHTTAP_REGULAR; hf < INTERP_FILTERS_ALL; ++hf) {
       const InterpFilter horiz_filter = static_cast<InterpFilter>(hf);
       for (int vf = EIGHTTAP_REGULAR; vf < INTERP_FILTERS_ALL; ++vf) {
         const InterpFilter vert_filter = static_cast<InterpFilter>(vf);
         for (int frac_x = 0; frac_x < 16; ++frac_x) {
           for (int frac_y = 0; frac_y < 16; ++frac_y) {
             for (const auto &param_set : compound_sets) {
               TestConvolve(horiz_filter, vert_filter, frac_x, frac_y,
                            param_set);
             }
           }
         }
       }
     }
   }

  private:
   // Runs the C reference and the function under test on the same pair of
   // random inputs and requires both the intermediate buffer and the final
   // output to match over the block area.
   void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
                     const int sub_x, const int sub_y,
                     const CompoundParam &compound) {
     const BlockSize &blk = GetParam().Block();
     const int w = blk.Width();
     const int h = blk.Height();
     const uint16_t *src_a = FirstRandomInput16(GetParam());
     const uint16_t *src_b = SecondRandomInput16(GetParam());

     DECLARE_ALIGNED(32, uint16_t, ref_out[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, CONV_BUF_TYPE, ref_conv_buf[MAX_SB_SQUARE]);
     Convolve(av1_highbd_dist_wtd_convolve_2d_c, src_a, src_b, ref_out,
              ref_conv_buf, compound, h_f, v_f, sub_x, sub_y);

     DECLARE_ALIGNED(32, uint16_t, test_out[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_out[MAX_SB_SQUARE]);
     Convolve(GetParam().TestFunction(), src_a, src_b, test_out, test_conv_out,
              compound, h_f, v_f, sub_x, sub_y);

     AssertOutputBufferEq(ref_conv_buf, test_conv_out, w, h);
     AssertOutputBufferEq(ref_out, test_out, w, h);
   }

   // Calls test_func twice: first on src1 with GetConvolveParams(0, ...), then
   // on src2 with GetConvolveParams(1, ...). Both calls share dst, conv_buf and
   // the filter parameters looked up from h_f/v_f and the block dimensions.
   void Convolve(highbd_convolve_2d_func test_func, const uint16_t *src1,
                 const uint16_t *src2, uint16_t *dst, uint16_t *conv_buf,
                 const CompoundParam &compound, const InterpFilter h_f,
                 const InterpFilter v_f, const int sub_x, const int sub_y) {
     const BlockSize &blk = GetParam().Block();
     const int w = blk.Width();
     const int h = blk.Height();

     const InterpFilterParams *horiz_params =
         av1_get_interp_filter_params_with_block_size(h_f, w);
     const InterpFilterParams *vert_params =
         av1_get_interp_filter_params_with_block_size(v_f, h);
     const int bd = GetParam().BitDepth();

     const uint16_t *const pass_src[2] = { src1, src2 };
     for (int pass = 0; pass < 2; ++pass) {
       ConvolveParams pass_params =
           GetConvolveParams(pass, conv_buf, kOutputStride, bd, compound);
       test_func(pass_src[pass], w, dst, kOutputStride, w, h, horiz_params,
                 vert_params, sub_x, sub_y, &pass_params, bd);
     }
   }
 };

 // Bit-exactness test against the C reference.
 TEST_P(AV1Convolve2DHighbdCompoundTest, RunTest) { RunTest(); }

 // The C function is always instantiated; here it is compared with itself.
 INSTANTIATE_TEST_SUITE_P(
     C, AV1Convolve2DHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_c));

 // SSE4.1 implementation, only when the build provides it.
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AV1Convolve2DHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sse4_1));
 #endif

 // AVX2 implementation, only when the build provides it.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1Convolve2DHighbdCompoundTest,
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_avx2));
 #endif

 //////////////////////////////////////////////////////////
 // Nonseparable convolve-2d functions (high bit-depth)
 //////////////////////////////////////////////////////////

 #if CONFIG_LR_IMPROVEMENTS
 // Signature shared by the nonseparable high bit-depth convolve functions
 // exercised below (C reference and optimized versions).
 using highbd_convolve_nonsep_2d_func = void (*)(
     const uint16_t *src, int src_stride,
     const NonsepFilterConfig *filter_config, const int16_t *filter,
     uint16_t *dst, int dst_stride, int bit_depth, int block_row_begin,
     int block_row_end, int block_col_begin, int block_col_end);

 // Bit-exactness and speed tests for the nonseparable symmetric high bit-depth
 // convolve functions. The function under test is compared with
 // av1_convolve_symmetric_highbd_c (RESTORE_PC_WIENER) or
 // av1_convolve_symmetric_subtract_center_highbd_c (RESTORE_WIENER_NONSEP),
 // once with the luma filter config and once with the chroma filter config.
 class AV1ConvolveNonSep2DHighbdTest
     : public AV1ConvolveTest<highbd_convolve_nonsep_2d_func> {
  public:
   // Runs kTestIterations bit-exactness rounds, each with freshly randomized
   // filter taps.
   void RunTest(RestorationType rtype) {
     for (int i = 0; i < kTestIterations; i++) {
       SetFilterTaps();
       TestConvolve(FilterTaps_, rtype);
     }
   }

   // Times the C reference against the function under test.
   void RunSpeedTest(RestorationType rtype) {
     // FilterTaps_ has no initializer, so it must be filled before it is handed
     // to the timed functions; otherwise they would read indeterminate values.
     SetFilterTaps();
     SpeedTestConvolve(FilterTaps_, rtype);
   }

  private:
   // Picks the C reference function and the { luma, chroma } filter configs
   // that belong to rtype. For any other rtype both config pointers are left
   // null so that callers can detect the unsupported case.
   void SelectReference(RestorationType rtype,
                        highbd_convolve_nonsep_2d_func *ref_func,
                        const NonsepFilterConfig *filter_config[2]) const {
     *ref_func = av1_convolve_symmetric_highbd_c;
     filter_config[0] = nullptr;
     filter_config[1] = nullptr;

     if (rtype == RESTORE_PC_WIENER) {
       filter_config[0] = &UnconstrainedSumFilterConfig_;
       filter_config[1] = &PcWienerNonsepFilterConfigChroma_;
     } else if (rtype == RESTORE_WIENER_NONSEP) {
       // When CONFIG_WIENER_NONSEP=1, luma and chroma planes use a different
       // number of filter taps and both need to be tested. Here, luma is
       // tested for 12/13-tap filtering whereas chroma is tested for 6-tap
       // filtering.
       *ref_func = av1_convolve_symmetric_subtract_center_highbd_c;
       filter_config[0] = &UnitSumFilterConfig_;
       filter_config[1] = &UnitSumFilterConfigChroma_;
     }
   }

   // Runs the reference and the function under test with each of the two
   // filter configs selected by rtype and requires identical output over the
   // width x height area.
   void BitMatchTest(const uint16_t *input, int input_stride, int width,
                     int height, const int16_t *filter, uint16_t *reference,
                     uint16_t *test, int dst_stride, int bit_depth,
                     int block_row_begin, int block_row_end, int block_col_begin,
                     int block_col_end, RestorationType rtype) {
     const NonsepFilterConfig *filter_config[2] = { nullptr, nullptr };
     highbd_convolve_nonsep_2d_func ref_func = av1_convolve_symmetric_highbd_c;
     const int num_planes = 2;

     SelectReference(rtype, &ref_func, filter_config);
     // A gtest assertion (rather than assert()) so that the check is also
     // active in builds that define NDEBUG.
     ASSERT_TRUE(filter_config[0] != nullptr && filter_config[1] != nullptr)
         << "Unsupported restoration type";

     for (int plane = 0; plane < num_planes; plane++) {
       ref_func(input, input_stride, filter_config[plane], filter, reference,
                dst_stride, bit_depth, block_row_begin, block_row_end,
                block_col_begin, block_col_end);
       GetParam().TestFunction()(input, input_stride, filter_config[plane],
                                 filter, test, dst_stride, bit_depth,
                                 block_row_begin, block_row_end, block_col_begin,
                                 block_col_end);
       AssertOutputBufferEq(reference, test, width, height);
     }
   }

   // Bit-match check on random input with the supplied filter, followed by a
   // second check on the "extreme" input with extreme-valued filter taps.
   void TestConvolve(const int16_t *filter, RestorationType rtype) {
     const int width = GetParam().Block().Width();
     const int height = GetParam().Block().Height();
     const int bit_depth = GetParam().BitDepth();

     const uint16_t *input = FirstRandomInput16(GetParam());
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);

     ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
         << "Not enough padding for 7x7 filters";
     // Step kMaxTapOffset rows and columns into the buffer so that the filter
     // taps with negative offsets stay inside it.
     const uint16_t *centered_input =
         input + kMaxTapOffset * width + kMaxTapOffset;
     const int input_stride = width;
     BitMatchTest(centered_input, input_stride, width, height, filter, reference,
                  test, kOutputStride, bit_depth, 0, height, 0, width, rtype);

     // Extreme value test
     const uint16_t *extreme_input = FirstRandomInput16Extreme(GetParam());
     const uint16_t *centered_extreme_input =
         extreme_input + kMaxTapOffset * width + kMaxTapOffset;
     int16_t extreme_taps[kNumSymmetricTaps + 1];
     RandomizeExtreamFilterTap(extreme_taps, kNumSymmetricTaps + 1,
                               kMaxPrecisionBeforeOverflow);
     BitMatchTest(centered_extreme_input, input_stride, width, height,
                  extreme_taps, reference, test, kOutputStride, bit_depth, 0,
                  height, 0, width, rtype);
   }

   // For each of the two filter configs selected by rtype, times
   // kSpeedIterations calls of the C reference and of the function under test
   // and prints the per-pixel times and their ratio.
   void SpeedTestConvolve(const int16_t *filter, RestorationType rtype) {
     const int width = GetParam().Block().Width();
     const int height = GetParam().Block().Height();
     const int bit_depth = GetParam().BitDepth();
     const int num_planes = 2;

     const uint16_t *input = FirstRandomInput16(GetParam());
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);

     ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
         << "Not enough padding for 7x7 filters";
     const uint16_t *centered_input =
         input + kMaxTapOffset * width + kMaxTapOffset;

     const NonsepFilterConfig *filter_config[2] = { nullptr, nullptr };
     highbd_convolve_nonsep_2d_func ref_func = av1_convolve_symmetric_highbd_c;
     SelectReference(rtype, &ref_func, filter_config);
     // Fail cleanly instead of passing a null config to the timed functions.
     ASSERT_TRUE(filter_config[0] != nullptr && filter_config[1] != nullptr)
         << "Unsupported restoration type";

     for (int plane = 0; plane < num_planes; plane++) {
       // Calculate time taken by reference/c function
       aom_usec_timer timer;
       aom_usec_timer_start(&timer);
       for (int i = 0; i < kSpeedIterations; ++i) {
         ref_func(centered_input, width, filter_config[plane], filter, reference,
                  kOutputStride, bit_depth, 0, height, 0, width);
       }
       aom_usec_timer_mark(&timer);
       auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

       // Calculate time taken by optimized/intrinsic function
       aom_usec_timer_start(&timer);
       for (int i = 0; i < kSpeedIterations; ++i) {
         GetParam().TestFunction()(centered_input, width, filter_config[plane],
                                   filter, test, kOutputStride, bit_depth, 0,
                                   height, 0, width);
       }
       aom_usec_timer_mark(&timer);
       auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

       float c_time_per_pixel =
           1000.0f * elapsed_time_c / (kSpeedIterations * width * height);
       float opt_time_per_pixel =
           1000.0f * elapsed_time_opt / (kSpeedIterations * width * height);
       float scaling = c_time_per_pixel / opt_time_per_pixel;
       printf(
           "plane=%3d, %3dx%-3d: c_time_per_pixel=%10.5f, "
           "opt_time_per_pixel=%10.5f,  scaling=%f \n",
           plane, width, height, c_time_per_pixel, opt_time_per_pixel, scaling);
     }
   }

   // Randomizes all kNumSymmetricTaps + 1 entries of FilterTaps_. Entries
   // [0, kNumSymmetricTaps) are the taps shared by the symmetric pixel pairs
   // of the configs below; the last entry (index kNumSymmetricTaps) is the
   // tap index that the configs with a { 0, 0, ... } row assign to the center
   // pixel.
   void SetFilterTaps() {
     Randomize(FilterTaps_, kNumSymmetricTaps + 1, kMaxPrecisionBeforeOverflow);
   }

   // Fills the array p with random values. Only the low max_bit_range bits of
   // each random number are kept, so every stored value lies in
   // [0, 2^max_bit_range).
   void Randomize(int16_t *p, int size, int max_bit_range) {
     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
     for (int i = 0; i < size; ++i) {
       p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1);
     }
   }

   // Fills the array p with, per entry and at random, either the largest or
   // the smallest value of a signed max_bit_range-bit integer.
   void RandomizeExtreamFilterTap(int16_t *p, int size, int max_bit_range) {
     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
     const int sign_max_val = (1 << (max_bit_range - 1)) - 1;
     for (int i = 0; i < size; ++i) {
       // Cast straight to the destination type; going through uint16_t would
       // rely on an out-of-range unsigned-to-signed conversion for the
       // negative value.
       p[i] = static_cast<int16_t>(RandBool() ? sign_max_val
                                              : -(sign_max_val + 1));
     }
   }

   // Returns 0 or 1, taken from bit 7 of an 8-bit random value.
   int RandBool() {
     const uint32_t value = rnd_.Rand8();
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 7) & 0x1;
   }

   libaom_test::ACMRandom rnd_;
   static constexpr int kMaxPrecisionBeforeOverflow = 12;
   static constexpr int kNumSymmetricTaps = 12;
   static constexpr int kNumSymmetricTapsChroma = 6;
   static constexpr int kMaxTapOffset = 3;  // Filters are 7x7.
   static constexpr int kSpeedIterations = 10000;
   static constexpr int kTestIterations = 100;

   // Configuration for nonseparable 7x7 filters for DIAMOND shape.
   // Format is offset (i) row and (ii) column from center pixel
   // and the (iii) filter-tap index that multiplies the pixel at
   // the respective offset.
   const int NonsepConfig_[25][3] = {
     { -3, 0, 0 },  { 3, 0, 0 },  { -2, -1, 1 }, { 2, 1, 1 },   { -2, 0, 2 },
     { 2, 0, 2 },   { -2, 1, 3 }, { 2, -1, 3 },  { -1, -2, 4 }, { 1, 2, 4 },
     { -1, -1, 5 }, { 1, 1, 5 },  { -1, 0, 6 },  { 1, 0, 6 },   { -1, 1, 7 },
     { 1, -1, 7 },  { -1, 2, 8 }, { 1, -2, 8 },  { 0, -3, 9 },  { 0, 3, 9 },
     { 0, -2, 10 }, { 0, 2, 10 }, { 0, -1, 11 }, { 0, 1, 11 },  { 0, 0, 12 },
   };

   // Chroma layout in the same { row, column, tap index } format: 12
   // off-center pixels sharing six taps plus the center pixel on tap 6.
   const int wienerns_wout_subtract_center_config_uv_from_uv_[13][3] = {
     { 1, 0, 0 },   { -1, 0, 0 }, { 0, 1, 1 },  { 0, -1, 1 }, { 1, 1, 2 },
     { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 }, { 2, 0, 4 },  { -2, 0, 4 },
     { 0, 2, 5 },   { 0, -2, 5 }, { 0, 0, 6 },
   };

   // Filters use all unique taps.
   const NonsepFilterConfig UnconstrainedSumFilterConfig_ = {
     kMaxPrecisionBeforeOverflow,
     2 * kNumSymmetricTaps + 1,
     0,
     NonsepConfig_,
     nullptr,
     0,
     0
   };

   // Chroma counterpart of UnconstrainedSumFilterConfig_, used as the second
   // config for RESTORE_PC_WIENER.
   const NonsepFilterConfig PcWienerNonsepFilterConfigChroma_ = {
     kMaxPrecisionBeforeOverflow,
     2 * kNumSymmetricTapsChroma + 1,
     0,
     wienerns_wout_subtract_center_config_uv_from_uv_,
     nullptr,
     0,
     0
   };

   // Configuration for UnitSumFilterConfig_ wiener nonseparable 7x7 filters for
   // DIAMOND shape. Format is offset (i) row and (ii) column from center pixel
   // and the (iii) filter-tap index that multiplies the pixel at the respective
   // offset.
   const int WienerNonsepConfig_[25][3] = {
     { 1, 0, 0 },
     { -1, 0, 0 },
     { 0, 1, 1 },
     { 0, -1, 1 },
     { 2, 0, 2 },
     { -2, 0, 2 },
     { 0, 2, 3 },
     { 0, -2, 3 },
     { 1, 1, 4 },
     { -1, -1, 4 },
     { -1, 1, 5 },
     { 1, -1, 5 },
     { 2, 1, 6 },
     { -2, -1, 6 },
     { 2, -1, 7 },
     { -2, 1, 7 },
     { 1, 2, 8 },
     { -1, -2, 8 },
     { 1, -2, 9 },
     { -1, 2, 9 },
     { 3, 0, 10 },
     { -3, 0, 10 },
     { 0, 3, 11 },
     { 0, -3, 11 },
 #if USE_CENTER_WIENER_NONSEP
     { 0, 0, 12 },
 #endif  // USE_CENTER_WIENER_NONSEP
   };

   // Chroma layout for RESTORE_WIENER_NONSEP: 12 off-center pixels sharing six
   // taps, with no center entry.
   const int WienerNonsepConfigChroma_[12][3] = {
     { 1, 0, 0 }, { -1, 0, 0 },  { 0, 1, 1 },  { 0, -1, 1 },
     { 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
     { 2, 0, 4 }, { -2, 0, 4 },  { 0, 2, 5 },  { 0, -2, 5 },
   };

   // Filters use only the first (2 * kNumSymmetricTaps) taps. Center tap is
   // constrained.
   const NonsepFilterConfig UnitSumFilterConfig_ = {
     kMaxPrecisionBeforeOverflow,
 #if USE_CENTER_WIENER_NONSEP
     2 * kNumSymmetricTaps + 1,
 #else
     2 * kNumSymmetricTaps,
 #endif  // USE_CENTER_WIENER_NONSEP
     0,
     WienerNonsepConfig_,
     nullptr,
     0,
     1
   };

   // Config used for filtering of chroma when CONFIG_WIENER_NONSEP=1.
   const NonsepFilterConfig UnitSumFilterConfigChroma_ = {
     kMaxPrecisionBeforeOverflow,
     2 * kNumSymmetricTapsChroma,
     0,
     WienerNonsepConfigChroma_,
     nullptr,
     0,
     1
   };

   // Filter taps handed to the functions under test; filled by SetFilterTaps().
   int16_t FilterTaps_[kNumSymmetricTaps + 1];
 };

 // Bit-exactness test using the RESTORE_PC_WIENER reference and configs.
 TEST_P(AV1ConvolveNonSep2DHighbdTest, RunTest) { RunTest(RESTORE_PC_WIENER); }

 // Timing comparison. The DISABLED_ prefix keeps googletest from running it
 // unless disabled tests are explicitly requested.
 TEST_P(AV1ConvolveNonSep2DHighbdTest, DISABLED_Speed) {
   RunSpeedTest(RESTORE_PC_WIENER);
 }

 // Only an AVX2 instantiation is registered for this suite.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveNonSep2DHighbdTest,
                          BuildHighbdParams(av1_convolve_symmetric_highbd_avx2));
 #endif

 // Same fixture under a separate suite name so that the subtract-center
 // functions can be instantiated and run with RESTORE_WIENER_NONSEP.
 class AV1ConvolveWienerNonSep2DHighbdTest
     : public AV1ConvolveNonSep2DHighbdTest {};

 // Bit-exactness test using the RESTORE_WIENER_NONSEP reference and configs.
 TEST_P(AV1ConvolveWienerNonSep2DHighbdTest, RunTest) {
   RunTest(RESTORE_WIENER_NONSEP);
 }
 // Timing comparison; disabled by default via the DISABLED_ prefix.
 TEST_P(AV1ConvolveWienerNonSep2DHighbdTest, DISABLED_Speed) {
   RunSpeedTest(RESTORE_WIENER_NONSEP);
 }

 // Only an AVX2 instantiation is registered for this suite.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ConvolveWienerNonSep2DHighbdTest,
     BuildHighbdParams(av1_convolve_symmetric_subtract_center_highbd_avx2));
 #endif

 #endif  // CONFIG_LR_IMPROVEMENTS

 //////////////////////////////////////////////////////////
 // Nonseparable convolve-2d Dual functions (high bit-depth)
 //////////////////////////////////////////////////////////

 #if CONFIG_WIENER_NONSEP_CROSS_FILT
 // Signature shared by the dual-input nonseparable high bit-depth convolve
 // functions exercised below (C reference and optimized versions).
 using highbd_convolve_nonsep_dual_2d_func = void (*)(
     const uint16_t *dgd, int dgd_stride, const uint16_t *dgd_dual,
     int dgd_dual_stride, const NonsepFilterConfig *filter_config,
     const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
     int block_row_begin, int block_row_end, int block_col_begin,
     int block_col_end);

 // Bit-exactness and speed tests for the dual-input nonseparable symmetric
 // high bit-depth convolve functions. Depending on is_subtract_center the
 // function under test is compared with
 // av1_convolve_symmetric_dual_subtract_center_highbd_c or
 // av1_convolve_symmetric_dual_highbd_c.
 class AV1ConvolveNon_Sep_dual2DHighbdTest
     : public AV1ConvolveTest<highbd_convolve_nonsep_dual_2d_func> {
  public:
   // Runs kTestIterations bit-exactness rounds, each with freshly randomized
   // filter taps.
   void RunTest(int is_subtract_center) {
     for (int i = 0; i < kTestIterations; i++) {
       SetFilterTaps();
       TestConvolve(FilterTaps_, is_subtract_center);
     }
   }

   // Times the C reference against the function under test.
   void RunSpeedTest(int is_subtract_center) {
     // FilterTaps_ has no initializer, so it must be filled before it is handed
     // to the timed functions; otherwise they would read indeterminate values.
     SetFilterTaps();
     SpeedTestConvolve(FilterTaps_, is_subtract_center);
   }

  private:
   libaom_test::ACMRandom rnd_;
   static constexpr int kMaxPrecisionBeforeOverflow = 12;
   static constexpr int kNumSymmetricTaps = 6;
   // In dual filtering, 7 taps (6 symmetric + 1 center) are required for each of
   // the buffer.
   static constexpr int kNumSubtractCenterOffTaps = (2 * kNumSymmetricTaps) + 2;
   static constexpr int kMaxTapOffset = 2;  // Filters are 5x5.
   static constexpr int kSpeedIterations = 10000;
   static constexpr int kTestIterations = 100;

   // Declare the filter taps for worst case (i.e., for subtract center off
   // case).
   int16_t FilterTaps_[kNumSubtractCenterOffTaps];

   // Fills the array p with random values. Only the low max_bit_range bits of
   // each random number are kept, so every stored value lies in
   // [0, 2^max_bit_range).
   void Randomize(int16_t *p, int size, int max_bit_range) {
     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
     for (int i = 0; i < size; ++i) {
       p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1);
     }
   }

   // Randomizes every entry of FilterTaps_.
   void SetFilterTaps() {
     Randomize(FilterTaps_, kNumSubtractCenterOffTaps,
               kMaxPrecisionBeforeOverflow);
   }

   // Returns 0 or 1, taken from bit 7 of an 8-bit random value.
   int RandBool() {
     const uint32_t value = rnd_.Rand8();
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 7) & 0x1;
   }

   // Fills the array p with, per entry and at random, either the largest or
   // the smallest value of a signed max_bit_range-bit integer.
   void RandomizeExtreamFilterTap(int16_t *p, int size, int max_bit_range) {
     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
     const int sign_max_val = (1 << (max_bit_range - 1)) - 1;
     for (int i = 0; i < size; ++i) {
       // Cast straight to the destination type; going through uint16_t would
       // rely on an out-of-range unsigned-to-signed conversion for the
       // negative value.
       p[i] = static_cast<int16_t>(RandBool() ? sign_max_val
                                              : -(sign_max_val + 1));
     }
   }

   // Runs the C reference and the function under test on the same two inputs
   // (both read with stride dgd_stride) and requires identical output over the
   // width x height area.
   void BitMatchTest(const uint16_t *dgd, const uint16_t *dgd_dual,
                     int dgd_stride, int width, int height,
                     const int16_t *filter, uint16_t *reference, uint16_t *test,
                     int dst_stride, int bit_depth, int block_row_begin,
                     int block_row_end, int block_col_begin, int block_col_end,
                     int is_subtract_center) {
     // Set filter_config and reference function appropriately.
     const highbd_convolve_nonsep_dual_2d_func ref_func =
         is_subtract_center
             ? av1_convolve_symmetric_dual_subtract_center_highbd_c
             : av1_convolve_symmetric_dual_highbd_c;
     const NonsepFilterConfig *filter_cfg = is_subtract_center
                                                ? &DualFilterWithCenterConfig_
                                                : &DualFilterWithoutCenterConfig_;

     // Reference function
     ref_func(dgd, dgd_stride, dgd_dual, dgd_stride, filter_cfg, filter,
              reference, dst_stride, bit_depth, block_row_begin, block_row_end,
              block_col_begin, block_col_end);

     // Test function
     GetParam().TestFunction()(dgd, dgd_stride, dgd_dual, dgd_stride, filter_cfg,
                               filter, test, dst_stride, bit_depth,
                               block_row_begin, block_row_end, block_col_begin,
                               block_col_end);

     // Compare the output of reference and test for bit match
     AssertOutputBufferEq(reference, test, width, height);
   }

   // Bit-match check on random input with the supplied filter, followed by a
   // second check on the "extreme" input with extreme-valued filter taps.
   void TestConvolve(const int16_t *filter, int is_subtract_center) {
     const int width = GetParam().Block().Width();
     const int height = GetParam().Block().Height();
     const int bit_depth = GetParam().BitDepth();

     // NOTE(review): both inputs come from FirstRandomInput16(). If that helper
     // hands back the same backing buffer on every call, dgd and dgd_dual
     // alias each other - confirm whether SecondRandomInput16() was intended.
     const uint16_t *dgd = FirstRandomInput16(GetParam());
     const uint16_t *dgd_dual = FirstRandomInput16(GetParam());
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);

     ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
         << "Not enough padding for 5x5 filters";
     // Step kMaxTapOffset rows and columns into each buffer so that the filter
     // taps with negative offsets stay inside it.
     const uint16_t *centered_input1 =
         dgd + kMaxTapOffset * width + kMaxTapOffset;
     const uint16_t *centered_input2 =
         dgd_dual + kMaxTapOffset * width + kMaxTapOffset;
     const int input_stride = width;
     BitMatchTest(centered_input1, centered_input2, input_stride, width, height,
                  filter, reference, test, kOutputStride, bit_depth, 0, height,
                  0, width, is_subtract_center);

     // Extreme value test
     const uint16_t *extreme_input1 = FirstRandomInput16Extreme(GetParam());
     const uint16_t *extreme_input2 = FirstRandomInput16Extreme(GetParam());
     const uint16_t *centered_extreme_input1 =
         extreme_input1 + kMaxTapOffset * width + kMaxTapOffset;
     const uint16_t *centered_extreme_input2 =
         extreme_input2 + kMaxTapOffset * width + kMaxTapOffset;
     int16_t extreme_taps[kNumSubtractCenterOffTaps];
     RandomizeExtreamFilterTap(extreme_taps, kNumSubtractCenterOffTaps,
                               kMaxPrecisionBeforeOverflow);
     BitMatchTest(centered_extreme_input1, centered_extreme_input2, input_stride,
                  width, height, extreme_taps, reference, test, kOutputStride,
                  bit_depth, 0, height, 0, width, is_subtract_center);
   }

   // Times kSpeedIterations calls of the C reference and of the function under
   // test and prints the per-pixel times and their ratio.
   void SpeedTestConvolve(const int16_t *filter, int is_subtract_center) {
     const int width = GetParam().Block().Width();
     const int height = GetParam().Block().Height();
     const int bit_depth = GetParam().BitDepth();

     const uint16_t *dgd = FirstRandomInput16(GetParam());
     const uint16_t *dgd_dual = FirstRandomInput16(GetParam());
     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);

     ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
         << "Not enough padding for 5x5 filters";
     const uint16_t *centered_input1 =
         dgd + kMaxTapOffset * width + kMaxTapOffset;
     const uint16_t *centered_input2 =
         dgd_dual + kMaxTapOffset * width + kMaxTapOffset;

     // Set filter_config and reference function appropriately.
     const highbd_convolve_nonsep_dual_2d_func ref_func =
         is_subtract_center
             ? av1_convolve_symmetric_dual_subtract_center_highbd_c
             : av1_convolve_symmetric_dual_highbd_c;
     const NonsepFilterConfig *filter_cfg = is_subtract_center
                                                ? &DualFilterWithCenterConfig_
                                                : &DualFilterWithoutCenterConfig_;

     // Calculate time taken by reference/c function
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int i = 0; i < kSpeedIterations; ++i) {
       ref_func(centered_input1, width, centered_input2, width, filter_cfg,
                filter, reference, kOutputStride, bit_depth, 0, height, 0,
                width);
     }
     aom_usec_timer_mark(&timer);
     auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

     // Calculate time taken by optimized/intrinsic function
     aom_usec_timer_start(&timer);
     for (int i = 0; i < kSpeedIterations; ++i) {
       GetParam().TestFunction()(centered_input1, width, centered_input2, width,
                                 filter_cfg, filter, test, kOutputStride,
                                 bit_depth, 0, height, 0, width);
     }
     aom_usec_timer_mark(&timer);
     auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

     float c_time_per_pixel =
         1000.0f * elapsed_time_c / (kSpeedIterations * width * height);
     float opt_time_per_pixel =
         1000.0f * elapsed_time_opt / (kSpeedIterations * width * height);
     float scaling = c_time_per_pixel / opt_time_per_pixel;
     printf(
         " %3dx%-3d: c_time_per_pixel=%10.5f, "
         "opt_time_per_pixel=%10.5f,  scaling=%f \n",
         width, height, c_time_per_pixel, opt_time_per_pixel, scaling);
   }

   // Pixel layouts in { row offset, column offset, filter-tap index } format.
   // First buffer, no center entry: taps 0..5.
   const int wienerns_config_uv_from_uv[12][3] = {
     { 1, 0, 0 }, { -1, 0, 0 },  { 0, 1, 1 },  { 0, -1, 1 },
     { 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
     { 2, 0, 4 }, { -2, 0, 4 },  { 0, 2, 5 },  { 0, -2, 5 },
   };

   // Second buffer, no center entry: taps 6..11.
   const int wienerns_config_uv_from_y[12][3] = {
     { 1, 0, 6 },  { -1, 0, 6 },  { 0, 1, 7 },  { 0, -1, 7 },
     { 1, 1, 8 },  { -1, -1, 8 }, { -1, 1, 9 }, { 1, -1, 9 },
     { 2, 0, 10 }, { -2, 0, 10 }, { 0, 2, 11 }, { 0, -2, 11 },
   };

   // First buffer with a center entry at (0, 0): taps 0..6.
   const int wienerns_wout_subtract_center_config_uv_from_uv[13][3] = {
     { 1, 0, 0 },   { -1, 0, 0 }, { 0, 1, 1 },  { 0, -1, 1 }, { 1, 1, 2 },
     { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 }, { 2, 0, 4 },  { -2, 0, 4 },
     { 0, 2, 5 },   { 0, -2, 5 }, { 0, 0, 6 },
   };

   // Adjust the beginning tap to account for the above change and add a tap at
   // (0, 0).
   const int wienerns_wout_subtract_center_config_uv_from_y[13][3] = {
     { 1, 0, 7 },   { -1, 0, 7 },  { 0, 1, 8 },   { 0, -1, 8 }, { 1, 1, 9 },
     { -1, -1, 9 }, { -1, 1, 10 }, { 1, -1, 10 }, { 2, 0, 11 }, { -2, 0, 11 },
     { 0, 2, 12 },  { 0, -2, 12 }, { 0, 0, 13 },
   };

   // Config used when is_subtract_center is non-zero.
   const NonsepFilterConfig DualFilterWithCenterConfig_ = {
     kMaxPrecisionBeforeOverflow,  // prec_bits;
     sizeof(wienerns_config_uv_from_uv) /
         sizeof(wienerns_config_uv_from_uv[0]),  // num_pixels;
     sizeof(wienerns_config_uv_from_y) /
         sizeof(wienerns_config_uv_from_y[0]),  // num_pixels2
     wienerns_config_uv_from_uv,                // config
     wienerns_config_uv_from_y,                 // config2
     0,                                         // strict_bounds
     1                                          // subtract_center
   };

   // Config used when is_subtract_center is zero.
   const NonsepFilterConfig DualFilterWithoutCenterConfig_ = {
     kMaxPrecisionBeforeOverflow,  // prec_bits;
     sizeof(wienerns_wout_subtract_center_config_uv_from_uv) /
         sizeof(
             wienerns_wout_subtract_center_config_uv_from_uv[0]),  // num_pixels;
     sizeof(wienerns_wout_subtract_center_config_uv_from_y) /
         sizeof(
             wienerns_wout_subtract_center_config_uv_from_y[0]),  // num_pixels2
     wienerns_wout_subtract_center_config_uv_from_uv,             // config
     wienerns_wout_subtract_center_config_uv_from_y,              // config2
     0,  // strict_bounds
     0   // subtract_center
   };
 };

 // Subtract-center variant: RunTest/RunSpeedTest are called with
 // is_subtract_center = 1. The speed test is disabled by default via the
 // DISABLED_ prefix.
 TEST_P(AV1ConvolveNon_Sep_dual2DHighbdTest, RunTest) { RunTest(1); }
 TEST_P(AV1ConvolveNon_Sep_dual2DHighbdTest, DISABLED_Speed) { RunSpeedTest(1); }

 // Only an AVX2 instantiation is registered for this suite.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ConvolveNon_Sep_dual2DHighbdTest,
     BuildHighbdParams(av1_convolve_symmetric_dual_subtract_center_highbd_avx2));
 #endif  // HAVE_AVX2

 /* Unit test for dual filtering with subtract-center off. The fixture is
  * reused under a separate suite name so that it can be instantiated with the
  * non-subtract-center functions and run with is_subtract_center = 0. */
 class AV1ConvolveDualWithoutsubtract2DHighbdTest
     : public AV1ConvolveNon_Sep_dual2DHighbdTest {};

 TEST_P(AV1ConvolveDualWithoutsubtract2DHighbdTest, RunTest) { RunTest(0); }
 // Timing comparison; disabled by default via the DISABLED_ prefix.
 TEST_P(AV1ConvolveDualWithoutsubtract2DHighbdTest, DISABLED_Speed) {
   RunSpeedTest(0);
 }

 // Only an AVX2 instantiation is registered for this suite.
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1ConvolveDualWithoutsubtract2DHighbdTest,
     BuildHighbdParams(av1_convolve_symmetric_dual_highbd_avx2));
 #endif

 #endif  // CONFIG_WIENER_NONSEP_CROSS_FILT

 //////////////////////////////////////////////////////////
 // Unit tests for the buffer accumulation used to derive the filter
 // index of each block (pc_wiener_block_size: 4x4)
 //////////////////////////////////////////////////////////

 #if CONFIG_LR_IMPROVEMENTS

 // Builds the pc_wiener test parameter list: one TestParam per (block size,
 // bit depth) pair. Block sizes are taken from BLOCK_4X4 up to (but excluding)
 // BLOCK_SIZES_ALL, dropping any whose width or height exceeds
 // RESTORATION_PROC_UNIT_SIZE. The std::set removes duplicate dimensions.
 template <typename T>
 std::vector<TestParam<T>> GetPCWienerTestParams(
     std::initializer_list<int> bit_depths, T test_func) {
   std::set<BlockSize> block_dims;
   for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
     const int wide = block_size_wide[bsize];
     const int high = block_size_high[bsize];
     const bool fits_proc_unit = wide <= RESTORATION_PROC_UNIT_SIZE &&
                                 high <= RESTORATION_PROC_UNIT_SIZE;
     if (fits_proc_unit) {
       block_dims.insert(BlockSize(wide, high));
       // Add in smaller chroma sizes as well.
       if (wide == 4 || high == 4) {
         block_dims.insert(BlockSize(wide / 2, high / 2));
       }
     }
   }

   std::vector<TestParam<T>> params;
   for (const BlockSize &dims : block_dims) {
     for (const int depth : bit_depths) {
       params.push_back(TestParam<T>(dims, depth, test_func));
     }
   }
   return params;
 }
 // Wraps the pc_wiener parameter list for bit depths 10 and 12 into a gtest
 // parameter generator.
 template <typename T>
 ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdPCWienerParams(
     T test_func) {
   const std::vector<TestParam<T>> highbd_params =
       GetPCWienerTestParams({ 10, 12 }, test_func);
   return ::testing::ValuesIn(highbd_params);
 }

 // Function-pointer type for the routines that fill the directional feature
 // buffers; it is the parameter type of AV1FillDirFeatureBufHighbdTest.
 typedef void (*fill_directional_feature_buffers_highbd_func)(
     int *feature_sum_buffers[], int16_t *feature_line_buffers[], int row,
     int buffer_row, const uint16_t *dgd, int dgd_stride, int width,
     int feature_lead, int feature_lag);

 // Verifies an optimized fill_directional_feature_buffers_highbd implementation
 // (the test parameter) against fill_directional_feature_buffers_highbd_c.
 // Both versions start from identical randomized feature line/sum buffers and
 // process the same input rows; the resulting feature sum buffers must match
 // exactly.
 class AV1FillDirFeatureBufHighbdTest
     : public AV1ConvolveTest<fill_directional_feature_buffers_highbd_func> {
  public:
   // Repeats the C-vs-SIMD comparison kTestIterations times, re-randomizing the
   // working buffers before every round.
   void RunTest() {
     for (int i = 0; i < kTestIterations; i++) {
       // Set buffer values here.
       SetBufferValues();
       TestConvolve();
     }
   }

   // Times the C and the optimized version and prints the speed-up.
   void RunSpeedTest() {
     // Initialize the working buffers so that the timed loops do not operate on
     // indeterminate memory returned by aom_malloc().
     SetBufferValues();
     SpeedTestConvolve();
   }

  protected:
   // Allocates buffer_width_ entries for every feature line and feature sum
   // buffer, for both the C and the SIMD side.
   // NOTE(review): if AV1ConvolveTest defines its own SetUp()/TearDown(), they
   // are not invoked from these overrides -- confirm that this is intended.
   virtual void SetUp() {
     for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
       feature_line_buffers_c_[j] = static_cast<int16_t *>(
           (aom_malloc(buffer_width_ * sizeof(*feature_line_buffers_c_[j]))));
       ASSERT_NE(feature_line_buffers_c_[j], nullptr);

       feature_line_buffers_simd_[j] = static_cast<int16_t *>(
           (aom_malloc(buffer_width_ * sizeof(*feature_line_buffers_simd_[j]))));
       ASSERT_NE(feature_line_buffers_simd_[j], nullptr);
     }

     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
       feature_sum_buffers_c_[j] = static_cast<int *>(
           (aom_malloc(buffer_width_ * sizeof(*feature_sum_buffers_c_[j]))));
       ASSERT_NE(feature_sum_buffers_c_[j], nullptr);

       feature_sum_buffers_simd_[j] = static_cast<int *>(
           (aom_malloc(buffer_width_ * sizeof(*feature_sum_buffers_simd_[j]))));
       ASSERT_NE(feature_sum_buffers_simd_[j], nullptr);
     }
   }

   // Releases every buffer allocated in SetUp(). The pointer arrays are
   // value-initialized, so entries SetUp() never reached are still null.
   virtual void TearDown() {
     for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
       aom_free(feature_line_buffers_c_[j]);
       feature_line_buffers_c_[j] = nullptr;
       aom_free(feature_line_buffers_simd_[j]);
       feature_line_buffers_simd_[j] = nullptr;
     }

     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
       aom_free(feature_sum_buffers_c_[j]);
       feature_sum_buffers_c_[j] = nullptr;
       aom_free(feature_sum_buffers_simd_[j]);
       feature_sum_buffers_simd_[j] = nullptr;
     }
   }

   // Randomizes the C-side buffers and mirrors them into the SIMD-side buffers
   // so that both implementations start from the same state.
   void SetBufferValues() {
     const int bitdepth = GetParam().BitDepth();
     for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
       Randomize(feature_line_buffers_c_[j], buffer_width_, bitdepth);
       memcpy(feature_line_buffers_simd_[j], feature_line_buffers_c_[j],
              buffer_width_ * sizeof(*feature_line_buffers_simd_[j]));
     }

     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
       RandomizeSigned31(feature_sum_buffers_c_[j], buffer_width_, 31);
       memcpy(feature_sum_buffers_simd_[j], feature_sum_buffers_c_[j],
              buffer_width_ * sizeof(*feature_sum_buffers_simd_[j]));
     }
   }

  private:
   libaom_test::ACMRandom rnd_;
   static constexpr int kSpeedIterations = 10000;
   static constexpr int kTestIterations = 100;

   // Runs the C and the optimized function over the same rows of one random
   // input block and requires the feature sum buffers to be identical.
   void TestConvolve() {
     const int width = GetParam().Block().Width();
     const int height = GetParam().Block().Height();
     // Input buffer allocation.
     const uint16_t *input = FirstRandomInput16(GetParam());
     const int input_stride = width;

     // C function call
     for (int i = 0; i < height; ++i) {
       const int row_to_process = AOMMIN(i + feature_lag, height + 3 - 2);
       fill_directional_feature_buffers_highbd_c(
           feature_sum_buffers_c_, feature_line_buffers_c_, row_to_process,
           feature_length - 1, input, input_stride, width, feature_lead,
           feature_lag);
     }

     // SIMD function call
     for (int i = 0; i < height; ++i) {
       const int row_to_process = AOMMIN(i + feature_lag, height + 3 - 2);
       GetParam().TestFunction()(feature_sum_buffers_simd_,
                                 feature_line_buffers_simd_, row_to_process,
                                 feature_length - 1, input, input_stride, width,
                                 feature_lead, feature_lag);
     }

     // Compare the outputs of C and SIMD
     for (int i = 0; i < NUM_PC_WIENER_FEATURES; i++) {
       const int *c_buf = feature_sum_buffers_c_[i];
       const int *simd_buf = feature_sum_buffers_simd_[i];
       for (int j = 0; j < buffer_width_; ++j) {
         // Report the column (j) of the mismatch, not the feature index again.
         ASSERT_EQ(c_buf[j], simd_buf[j])
             << "feature_buf=" << i << " Pixel mismatch at width (" << j << ")";
       }
     }
   }

   // Measures kSpeedIterations passes of the C and the optimized function over
   // one random input block and prints the per-pixel times and their ratio.
   void SpeedTestConvolve() {
     const int width = GetParam().Block().Width();
     const int height = GetParam().Block().Height();

     // Input buffer allocation.
     const uint16_t *input = FirstRandomInput16(GetParam());
     const int input_stride = width;

     // Calculate time taken for C function
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     for (int i = 0; i < kSpeedIterations; ++i) {
       for (int row = 0; row < height; ++row) {
         const int row_to_process = AOMMIN(row + feature_lag, height + 3 - 2);
         fill_directional_feature_buffers_highbd_c(
             feature_sum_buffers_c_, feature_line_buffers_c_, row_to_process,
             feature_length - 1, input, input_stride, width, feature_lead,
             feature_lag);
       }
     }
     aom_usec_timer_mark(&timer);
     auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

     // Calculate time taken by optimized/intrinsic function
     aom_usec_timer_start(&timer);
     for (int i = 0; i < kSpeedIterations; ++i) {
       for (int row = 0; row < height; ++row) {
         const int row_to_process = AOMMIN(row + feature_lag, height + 3 - 2);
         GetParam().TestFunction()(feature_sum_buffers_simd_,
                                   feature_line_buffers_simd_, row_to_process,
                                   feature_length - 1, input, input_stride,
                                   width, feature_lead, feature_lag);
       }
     }
     aom_usec_timer_mark(&timer);
     auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

     float c_time_per_pixel =
         (float)1000.0 * elapsed_time_c / (kSpeedIterations * width * height);
     float opt_time_per_pixel =
         (float)1000.0 * elapsed_time_opt / (kSpeedIterations * width * height);
     float scaling = c_time_per_pixel / opt_time_per_pixel;
     printf(
         "%3dx%-3d: c_time_per_pixel=%10.5f, "
         "opt_time_per_pixel=%10.5f,  scaling=%f \n",
         width, height, c_time_per_pixel, opt_time_per_pixel, scaling);
   }

   // Fills the array p with random values masked to max_bit_range bits, i.e.
   // non-negative values below (1 << max_bit_range).
   void Randomize(int16_t *p, int size, int max_bit_range) {
     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
     for (int i = 0; i < size; ++i) {
       p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1);
     }
   }

   // Fills the array p with random non-negative integers of at most
   // max_bit_range (<= 31) bits.
   void RandomizeSigned31(int *p, int size, uint32_t max_bit_range) {
     assert(max_bit_range <= 31);
     // Shift an unsigned one: `1 << 31` on a signed int is undefined behaviour.
     const uint32_t mask = (static_cast<uint32_t>(1) << max_bit_range) - 1;
     for (int i = 0; i < size; ++i) {
       p[i] = (int)(rnd_.Rand31() & mask);
     }
   }

   // Output (feature sum) and scratch (feature line) buffers for each side.
   int *feature_sum_buffers_c_[NUM_PC_WIENER_FEATURES] = {};
   int *feature_sum_buffers_simd_[NUM_PC_WIENER_FEATURES] = {};
   int16_t *feature_line_buffers_c_[NUM_FEATURE_LINE_BUFFERS] = {};
   int16_t *feature_line_buffers_simd_[NUM_FEATURE_LINE_BUFFERS] = {};
   const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA;
   const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA;
   const int feature_length = PC_WIENER_FEATURE_LENGTH_LUMA;
   // Number of entries allocated for every feature line/sum buffer.
   const int buffer_width_ = MAX_SB_SIZE + kInputPadding;
 };

 // Functional test: C and optimized outputs must match.
 TEST_P(AV1FillDirFeatureBufHighbdTest, RunTest) { RunTest(); }

 // Speed test; the DISABLED_ prefix keeps it out of default gtest runs.
 TEST_P(AV1FillDirFeatureBufHighbdTest, DISABLED_Speed) { RunSpeedTest(); }

 #if HAVE_AVX2
 // Instantiate the suite for the AVX2 implementation over the pc_wiener block
 // sizes and high bit depths.
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1FillDirFeatureBufHighbdTest,
     BuildHighbdPCWienerParams(fill_directional_feature_buffers_highbd_avx2));
 #endif  // HAVE_AVX2

 // Function-pointer type for the routines that fill the tskip sum buffer.
 typedef void (*FillTSkipSumBufferFunc)(int row, const uint8_t *tskip,
                                        int tskip_stride,
                                        int8_t *tskip_sum_buffer, int width,
                                        int height, int tskip_lead,
                                        int tskip_lag, bool use_strict_bounds);

 // gtest parameter tuple: the implementation under test.
 typedef std::tuple<const FillTSkipSumBufferFunc> AV1FillTSkipSumBufferFuncParam;

 // Checks the av1_fill_tskip_sum_buffer implementation supplied as the test
 // parameter against av1_fill_tskip_sum_buffer_c on a random 0/1 input map.
 class AV1Fill_TSkip_Sum_BufferTest
     : public ::testing::TestWithParam<AV1FillTSkipSumBufferFuncParam> {
  public:
   virtual void SetUp() { target_func_ = GET_PARAM(0); }

   // Runs the reference-vs-target comparison kTestIterations times.
   void RunTest() {
     int rounds_left = kTestIterations;
     while (rounds_left-- > 0) {
       TestTSkipSum();
     }
   }
   // Times both implementations and prints the ratio.
   void RunSpeedTest() { SpeedTestTSkipSum(); }

  private:
   libaom_test::ACMRandom rnd_;
   FillTSkipSumBufferFunc target_func_;

   static constexpr int kSpeedIterations = 10000;
   static constexpr int kTestIterations = 100;
   static constexpr int kNumPlanes = 1;
   static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE;
   static constexpr int kHeight = RESTORATION_PROC_UNIT_SIZE;
   static constexpr int kInputWidth = MI_SIZE_64X64;
   static constexpr int kInputStride = MI_SIZE_64X64;
   static constexpr int kOutputWidth =
       (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1);

   uint8_t input_buffer_[MI_SIZE_64X64 * MI_SIZE_64X64];
   int8_t ref_buffer_[kOutputWidth];
   int8_t test_buffer_[kOutputWidth];
   const bool tskip_strict_ = true;

   // Returns the top bit of a random byte.
   int RandBool() {
     // There's a bit more entropy in the upper bits of this implementation.
     const uint32_t random_byte = rnd_.Rand8();
     return (random_byte >> 7) & 0x1;
   }

   // Fills the whole input map with random 0/1 flags.
   void RandomizeInput() {
     const int num_entries = kInputWidth * kInputStride;
     for (int idx = 0; idx < num_entries; ++idx) {
       input_buffer_[idx] = static_cast<uint8_t>(RandBool() ? 1 : 0);
     }
   }

   // Feeds the same rows to the reference and the target function and requires
   // identical sum buffers.
   void TestTSkipSum() {
     RandomizeInput();

     for (int plane = 0; plane < kNumPlanes; ++plane) {
       // Planes other than the first are treated as subsampled by two.
       const int subsampling = (plane > 0) ? 1 : 0;
       const int plane_width = kWidth >> subsampling;
       const int plane_height = kHeight >> subsampling;
       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;
       const int first_row = -tskip_lead;
       const int end_row = tskip_lag + plane_height;

       memset(ref_buffer_, 0, sizeof(ref_buffer_));
       memset(test_buffer_, 0, sizeof(test_buffer_));

       // Reference function
       for (int r = first_row; r < end_row; ++r) {
         av1_fill_tskip_sum_buffer_c(r, input_buffer_, kInputStride,
                                     ref_buffer_, plane_width, plane_height,
                                     tskip_lead, tskip_lag, tskip_strict_);
       }

       // Test function
       for (int r = first_row; r < end_row; ++r) {
         target_func_(r, input_buffer_, kInputStride, test_buffer_,
                      plane_width, plane_height, tskip_lead, tskip_lag,
                      tskip_strict_);
       }

       // Compare the output of reference and test for bit match
       for (int col = 0; col < kOutputWidth; ++col) {
         ASSERT_EQ(ref_buffer_[col], test_buffer_[col])
             << " Mismatch at (" << col << ")";
       }
     }
   }

   // Times kSpeedIterations passes of each implementation. Note that the row
   // range here ends one row earlier than in TestTSkipSum().
   void SpeedTestTSkipSum() {
     RandomizeInput();

     for (int plane = 0; plane < kNumPlanes; ++plane) {
       const int subsampling = (plane > 0) ? 1 : 0;
       const int plane_width = kWidth >> subsampling;
       const int plane_height = kHeight >> subsampling;
       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;
       const int first_row = -tskip_lead;
       const int end_row = tskip_lag + plane_height - 1;

       memset(ref_buffer_, 0, sizeof(ref_buffer_));
       memset(test_buffer_, 0, sizeof(test_buffer_));

       // Calculate time taken by reference/c function
       aom_usec_timer timer;
       aom_usec_timer_start(&timer);
       for (int iter = 0; iter < kSpeedIterations; ++iter) {
         for (int r = first_row; r < end_row; ++r) {
           av1_fill_tskip_sum_buffer_c(r, input_buffer_, kInputStride,
                                       ref_buffer_, plane_width, plane_height,
                                       tskip_lead, tskip_lag, tskip_strict_);
         }
       }
       aom_usec_timer_mark(&timer);
       const auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

       // Calculate time taken by optimized/intrinsic function
       aom_usec_timer_start(&timer);
       for (int iter = 0; iter < kSpeedIterations; ++iter) {
         for (int r = first_row; r < end_row; ++r) {
           target_func_(r, input_buffer_, kInputStride, test_buffer_,
                        plane_width, plane_height, tskip_lead, tskip_lag,
                        tskip_strict_);
         }
       }
       aom_usec_timer_mark(&timer);
       const auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

       const float c_time_per_pixel =
           (float)1000.0 * elapsed_time_c / kSpeedIterations;
       const float opt_time_per_pixel =
           (float)1000.0 * elapsed_time_opt / kSpeedIterations;
       const float scaling = c_time_per_pixel / opt_time_per_pixel;
       printf(
           " %3dx%-3d: c_time_per_pixel=%10.5f, "
           "opt_time_per_pixel=%10.5f,  scaling=%f \n",
           plane_width, plane_height, c_time_per_pixel, opt_time_per_pixel,
           scaling);
     }
   }
 };

 // Functional test, plus a speed test that the DISABLED_ prefix keeps out of
 // default gtest runs.
 TEST_P(AV1Fill_TSkip_Sum_BufferTest, RunTest) { RunTest(); }
 TEST_P(AV1Fill_TSkip_Sum_BufferTest, DISABLED_Speed) { RunSpeedTest(); }

 #if HAVE_AVX2
 // Instantiate the suite for the AVX2 implementation.
 INSTANTIATE_TEST_SUITE_P(AVX2, AV1Fill_TSkip_Sum_BufferTest,
                          ::testing::Values(av1_fill_tskip_sum_buffer_avx2));
 #endif  // HAVE_AVX2

 //////////////////////////////////////////////////////////
 //       unit-test for 'directional_feature_accum'      //
 //////////////////////////////////////////////////////////
 // Function-pointer type for the routines that fill the directional feature
 // accumulators from the feature sum buffers.
 typedef void (*FillDirFeatureAccumFunc)(
     int dir_feature_accum[NUM_PC_WIENER_FEATURES][PC_WIENER_FEATURE_ACC_SIZE],
     int *feature_sum_buf[NUM_PC_WIENER_FEATURES], int width, int col_offset,
     int feature_lead, int feature_lag);

 // gtest parameter tuple: the implementation under test.
 typedef std::tuple<const FillDirFeatureAccumFunc>
     AV1FillDirFeatureAccumFuncParam;

 // Verifies the av1_fill_directional_feature_accumulators implementation
 // supplied as the test parameter against
 // av1_fill_directional_feature_accumulators_c, using randomized signed
 // feature sum buffers.
 class AV1FeatureDirAccumHighbdTest
     : public ::testing::TestWithParam<AV1FillDirFeatureAccumFuncParam> {
  public:
   // Runs kTestIterations rounds, each with freshly randomized inputs.
   void RunTest() {
     for (int i = 0; i < kTestIterations; i++) {
       FillInputBufs();
       TestFillDirFeatureAccum();
     }
   }

   // Times both implementations and prints the ratio.
   void RunSpeedTest() { SpeedTestConvolve(); }

   // Stores the function under test and allocates kInputWidth entries for each
   // feature sum buffer; a failed allocation aborts the test.
   virtual void SetUp() {
     target_func_ = GET_PARAM(0);

     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
       feature_sum_buf[j] = static_cast<int *>(
           aom_malloc(kInputWidth * sizeof(*feature_sum_buf[j])));
       ASSERT_NE(feature_sum_buf[j], nullptr);
     }
   }

   // Releases the buffers allocated in SetUp(). The pointer array is
   // value-initialized, so entries SetUp() never reached are still null.
   virtual void TearDown() {
     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
       aom_free(feature_sum_buf[j]);
       feature_sum_buf[j] = nullptr;
     }
   }

  private:
   libaom_test::ACMRandom rnd_;
   FillDirFeatureAccumFunc target_func_;

   static constexpr int kSpeedIterations = 1000000;
   static constexpr int kTestIterations = 100;
   static constexpr int kNumPlanes = 2;
   static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE;
   static constexpr int kInputWidth =
       (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1);

   int *feature_sum_buf[NUM_PC_WIENER_FEATURES] = {};
   int dir_feature_accum_buf_c[NUM_PC_WIENER_FEATURES]
                              [PC_WIENER_FEATURE_ACC_SIZE] = { { 0 } };
   int dir_feature_accum_buf_simd[NUM_PC_WIENER_FEATURES]
                                 [PC_WIENER_FEATURE_ACC_SIZE] = { { 0 } };

   // Returns the top bit of a random byte.
   int RandBool() {
     const uint32_t value = rnd_.Rand8();
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 7) & 0x1;
   }

   // Fills every feature sum buffer with random signed values and clears both
   // output accumulators.
   void FillInputBufs() {
     for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i) {
       for (int j = 0; j < kInputWidth; ++j) {
         // For the extreme values case, the maximum input that feature_sum_buf
         // can take is (kInputWidth * 2 * input_max_value). Hence, clipping the
         // value generated to 23 bit.
         const int max_range = (1 << 23);
         const int value = rnd_.Rand31() % max_range;
         // Store the full signed value. Narrowing through uint8_t would reduce
         // the input to 0..255 and lose both the sign and the 23-bit magnitude.
         feature_sum_buf[i][j] = RandBool() ? value : -value;
       }
     }
     // Reset output buffers
     av1_zero(dir_feature_accum_buf_c);
     av1_zero(dir_feature_accum_buf_simd);
   }

   // Runs both implementations for the full and the halved plane width and
   // requires identical accumulators.
   void TestFillDirFeatureAccum() {
     for (int plane = 0; plane < kNumPlanes; ++plane) {
       const int is_uv = (plane > 0);
       const int ss_x = is_uv ? 1 : 0;
       const int plane_width = kWidth >> ss_x;
       const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA;
       const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA;

       // Reset output buffers
       av1_zero(dir_feature_accum_buf_c);
       av1_zero(dir_feature_accum_buf_simd);

       // C function call
       av1_fill_directional_feature_accumulators_c(
           dir_feature_accum_buf_c, feature_sum_buf, plane_width, feature_lag,
           feature_lead, feature_lag);

       // SIMD function call
       target_func_(dir_feature_accum_buf_simd, feature_sum_buf, plane_width,
                    feature_lag, feature_lead, feature_lag);

       // Compare the output of reference and test for bit match
       for (int i = 0; i < NUM_PC_WIENER_FEATURES; i++) {
         for (int j = 0; j < PC_WIENER_FEATURE_ACC_SIZE; j++) {
           ASSERT_EQ(dir_feature_accum_buf_c[i][j],
                     dir_feature_accum_buf_simd[i][j])
               << " Feature_Buf: Pixel mismatch at (" << i << ", " << j << ", "
               << plane_width << ")";
         }
       }
     }
   }

   // Times kSpeedIterations calls of each implementation per plane width and
   // prints the per-pixel times and their ratio.
   void SpeedTestConvolve() {
     for (int plane = 0; plane < kNumPlanes; ++plane) {
       const int is_uv = (plane > 0);
       const int ss_x = is_uv ? 1 : 0;
       const int plane_width = kWidth >> ss_x;
       const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA;
       const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA;
       FillInputBufs();

       // Calculate time taken by reference/c function
       aom_usec_timer timer;
       aom_usec_timer_start(&timer);
       for (int i = 0; i < kSpeedIterations; ++i) {
         av1_fill_directional_feature_accumulators_c(
             dir_feature_accum_buf_c, feature_sum_buf, plane_width, feature_lag,
             feature_lead, feature_lag);
       }
       aom_usec_timer_mark(&timer);
       auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

       // Calculate time taken by optimized/intrinsic function
       aom_usec_timer_start(&timer);
       for (int i = 0; i < kSpeedIterations; ++i) {
         target_func_(dir_feature_accum_buf_simd, feature_sum_buf, plane_width,
                      feature_lag, feature_lead, feature_lag);
       }
       aom_usec_timer_mark(&timer);
       auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

       float c_time_per_pixel =
           (float)1000.0 * elapsed_time_c / (kSpeedIterations * plane_width);
       float opt_time_per_pixel =
           (float)1000.0 * elapsed_time_opt / (kSpeedIterations * plane_width);
       float scaling = c_time_per_pixel / opt_time_per_pixel;
       printf(
           " %3d: c_time_per_pixel=%10.5f, "
           "opt_time_per_pixel=%10.5f,  scaling=%f \n",
           plane_width, c_time_per_pixel, opt_time_per_pixel, scaling);
     }
   }
 };

 // Functional test, plus a speed test that the DISABLED_ prefix keeps out of
 // default gtest runs.
 TEST_P(AV1FeatureDirAccumHighbdTest, RunTest) { RunTest(); }
 TEST_P(AV1FeatureDirAccumHighbdTest, DISABLED_Speed) { RunSpeedTest(); }

 #if HAVE_AVX2
 // Instantiate the suite for the AVX2 implementation.
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1FeatureDirAccumHighbdTest,
     ::testing::Values(av1_fill_directional_feature_accumulators_avx2));
 #endif  // HAVE_AVX2

 //////////////////////////////////////////////////////////
 //     unit-test for 'fill_tskip_feature_accumulator'   //
 //////////////////////////////////////////////////////////
 // Function-pointer type for the routines that fill the tskip feature
 // accumulator from the tskip sum buffer.
 typedef void (*FillTskip_Accumulator_func)(
     int16_t tskip_feature_accum[PC_WIENER_FEATURE_ACC_SIZE],
     int8_t *tskip_sum_buff, int width, int col_offset, int tskip_lead,
     int tskip_lag);
 // gtest parameter tuple: the implementation under test.
 typedef std::tuple<const FillTskip_Accumulator_func>
     AV1FillTSkipAccumBufferFuncParam;

 // Verifies the av1_fill_tskip_feature_accumulator implementation supplied as
 // the test parameter against av1_fill_tskip_feature_accumulator_c, using a
 // randomized tskip sum buffer.
 class AV1TskipAccumHighbdTest
     : public ::testing::TestWithParam<AV1FillTSkipAccumBufferFuncParam> {
  public:
   // Stores the function under test and allocates the input buffer once per
   // test; a failed allocation aborts the test.
   virtual void SetUp() {
     target_func_ = GET_PARAM(0);
     tskip_sum_buf = static_cast<int8_t *>(
         aom_malloc(kInputWidth * sizeof(*tskip_sum_buf)));
     ASSERT_TRUE(tskip_sum_buf != nullptr);
   }

   // Frees the input buffer. Doing this here rather than at the end of each
   // test body means it is released even when a failing ASSERT returns early.
   virtual void TearDown() {
     aom_free(tskip_sum_buf);
     tskip_sum_buf = nullptr;
   }

   // Runs kTestIterations rounds, each with freshly randomized input.
   void RunTest() {
     for (int i = 0; i < kTestIterations; i++) TestTskipAccum();
   }

   // Times both implementations and prints the ratio.
   void RunSpeedTest() { SpeedTestTskipAccum(); }

  private:
   libaom_test::ACMRandom rnd_;
   FillTskip_Accumulator_func target_func_;

   static constexpr int kSpeedIterations = 1000000;
   static constexpr int kTestIterations = 100;
   static constexpr int kNumPlanes = 2;
   static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE;
   static constexpr int kInputWidth =
       (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1);

   int8_t *tskip_sum_buf = nullptr;
   int16_t tskip_feature_accum_c[PC_WIENER_FEATURE_ACC_SIZE] = { 0 };
   int16_t tskip_feature_accum_simd[PC_WIENER_FEATURE_ACC_SIZE] = { 0 };

   // Fills the input buffer with random signed values.
   void set_buffer_data() {
     // Input buffer filling. Tskip buffer max value will not cross width of
     // restoration unit size. Hence, the generated values are clipped to the
     // same.
     for (int i = 0; i < kInputWidth; ++i) {
       const int8_t value =
           static_cast<int8_t>(rnd_.Rand8() % RESTORATION_PROC_UNIT_SIZE);
       // The buffer element type is int8_t, so cast to that type directly.
       tskip_sum_buf[i] = static_cast<int8_t>(RandBool() ? value : -value);
     }
   }

   // Returns the top bit of a random byte.
   int RandBool() {
     const uint32_t value = rnd_.Rand8();
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 7) & 0x1;
   }

   // Runs both implementations for the full and the halved plane width and
   // requires identical accumulators.
   void TestTskipAccum() {
     // Fill input buffer
     set_buffer_data();

     // Loop over luma and chroma plane
     for (int plane = 0; plane < kNumPlanes; ++plane) {
       const int is_uv = (plane > 0);
       const int ss_x = is_uv ? 1 : 0;
       const int plane_width = kWidth >> ss_x;
       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;
       av1_zero(tskip_feature_accum_c);
       av1_zero(tskip_feature_accum_simd);

       // C function call
       av1_fill_tskip_feature_accumulator_c(tskip_feature_accum_c, tskip_sum_buf,
                                            plane_width, tskip_lag, tskip_lead,
                                            tskip_lag);

       // SIMD function call
       target_func_(tskip_feature_accum_simd, tskip_sum_buf, plane_width,
                    tskip_lag, tskip_lead, tskip_lag);

       // Compare the output of reference and test for bit match
       for (int i = 0; i < PC_WIENER_FEATURE_ACC_SIZE; i++) {
         ASSERT_EQ(tskip_feature_accum_c[i], tskip_feature_accum_simd[i])
             << " Feature_Buf: Pixel mismatch at (" << i << "," << plane_width
             << ")";
       }
     }
   }

   // Times kSpeedIterations calls of each implementation per plane width and
   // prints the per-pixel times and their ratio.
   void SpeedTestTskipAccum() {
     // Fill input buffer
     set_buffer_data();

     for (int plane = 0; plane < kNumPlanes; ++plane) {
       const int is_uv = (plane > 0);
       const int ss_x = is_uv ? 1 : 0;
       const int plane_width = kWidth >> ss_x;
       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;

       // Calculate time taken by reference/c function
       aom_usec_timer timer;
       aom_usec_timer_start(&timer);
       for (int i = 0; i < kSpeedIterations; ++i) {
         av1_fill_tskip_feature_accumulator_c(tskip_feature_accum_c,
                                              tskip_sum_buf, plane_width,
                                              tskip_lag, tskip_lead, tskip_lag);
       }
       aom_usec_timer_mark(&timer);
       auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

       // Calculate time taken by optimized/intrinsic function
       aom_usec_timer_start(&timer);
       for (int i = 0; i < kSpeedIterations; ++i) {
         target_func_(tskip_feature_accum_simd, tskip_sum_buf, plane_width,
                      tskip_lag, tskip_lead, tskip_lag);
       }
       aom_usec_timer_mark(&timer);
       auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

       float c_time_per_pixel =
           (float)1000.0 * elapsed_time_c / (kSpeedIterations * plane_width);
       float opt_time_per_pixel =
           (float)1000.0 * elapsed_time_opt / (kSpeedIterations * plane_width);
       float scaling = c_time_per_pixel / opt_time_per_pixel;
       printf(
           " %3d: c_time_per_pixel=%10.5f, "
           "opt_time_per_pixel=%10.5f,  scaling=%f \n",
           plane_width, c_time_per_pixel, opt_time_per_pixel, scaling);
     }
   }
 };

 // Functional test, plus a speed test that the DISABLED_ prefix keeps out of
 // default gtest runs.
 TEST_P(AV1TskipAccumHighbdTest, RunTest) { RunTest(); }
 TEST_P(AV1TskipAccumHighbdTest, DISABLED_Speed) { RunSpeedTest(); }

 #if HAVE_AVX2
 // Instantiate the suite for the AVX2 implementation.
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1TskipAccumHighbdTest,
     ::testing::Values(av1_fill_tskip_feature_accumulator_avx2));
 #endif  // HAVE_AVX2
 #endif  // CONFIG_LR_IMPROVEMENTS
 }  // namespace