| /* | 
 |  * Copyright (c) 2021, Alliance for Open Media. All rights reserved | 
 |  * | 
 |  * This source code is subject to the terms of the BSD 3-Clause Clear License | 
 |  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear | 
 |  * License was not distributed with this source code in the LICENSE file, you | 
 |  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the | 
 |  * Alliance for Open Media Patent License 1.0 was not distributed with this | 
 |  * source code in the PATENTS file, you can obtain it at | 
 |  * aomedia.org/license/patent-license/. | 
 |  */ | 
 |  | 
 | #include <ostream> | 
 | #include <set> | 
 | #include <vector> | 
 | #include "aom_ports/aom_timer.h" | 
 | #include "config/av1_rtcd.h" | 
 | #include "config/aom_dsp_rtcd.h" | 
 | #pragma GCC diagnostic push | 
 | #pragma GCC diagnostic ignored "-Wuninitialized" | 
 | #include "test/acm_random.h" | 
 | #pragma GCC diagnostic pop | 
 | #include "test/clear_system_state.h" | 
 | #include "test/util.h" | 
 | #include "third_party/googletest/src/googletest/include/gtest/gtest.h" | 
 |  | 
 | #include "av1/common/restoration.h" | 
 |  | 
 | namespace { | 
 |  | 
// TODO(any): Remove the following INTERP_FILTERS_ALL override once the 12-tap
// filter SIMD is done, so that the 12-tap filter is tested as well.
 | #undef INTERP_FILTERS_ALL | 
 | #define INTERP_FILTERS_ALL 4 | 
 |  | 
 | // All single reference convolve tests are parameterized on block size, | 
 | // bit-depth, and function to test. | 
 | // | 
 | // Note that parameterizing on these variables (and not other parameters) is | 
 | // a conscious decision - Jenkins needs some degree of parallelization to run | 
 | // the tests within the time limit, but if the number of parameters increases | 
 | // too much, the gtest framework does not handle it well (increased overhead per | 
 | // test, huge amount of output to stdout, etc.). | 
 | // | 
 | // Also note that the test suites must be named with the architecture, e.g., | 
 | // C, C_X, AVX2_X, ... The test suite that runs on Jenkins sometimes runs tests | 
 | // that cannot deal with intrinsics (e.g., the Valgrind tests on 32-bit x86 | 
 | // binaries) and will disable tests using a filter like | 
 | // --gtest_filter=-:SSE4_1.*. If the test suites are not named this way, the | 
 | // testing infrastructure will not selectively filter them properly. | 
 | class BlockSize { | 
 |  public: | 
 |   BlockSize(int w, int h) : width_(w), height_(h) {} | 
 |  | 
 |   int Width() const { return width_; } | 
 |   int Height() const { return height_; } | 
 |  | 
 |   bool operator<(const BlockSize &other) const { | 
 |     if (Width() == other.Width()) { | 
 |       return Height() < other.Height(); | 
 |     } | 
 |     return Width() < other.Width(); | 
 |   } | 
 |  | 
 |   bool operator==(const BlockSize &other) const { | 
 |     return Width() == other.Width() && Height() == other.Height(); | 
 |   } | 
 |  | 
 |  private: | 
 |   int width_; | 
 |   int height_; | 
 | }; | 
 |  | 
 | // Block size / bit depth / test function used to parameterize the tests. | 
 | template <typename T> | 
 | class TestParam { | 
 |  public: | 
 |   TestParam(const BlockSize &block, int bd, T test_func) | 
 |       : block_(block), bd_(bd), test_func_(test_func) {} | 
 |  | 
 |   const BlockSize &Block() const { return block_; } | 
 |   int BitDepth() const { return bd_; } | 
 |   T TestFunction() const { return test_func_; } | 
 |  | 
 |   bool operator==(const TestParam &other) const { | 
 |     return Block() == other.Block() && BitDepth() == other.BitDepth() && | 
 |            TestFunction() == other.TestFunction(); | 
 |   } | 
 |  | 
 |  private: | 
 |   BlockSize block_; | 
 |   int bd_; | 
 |   T test_func_; | 
 | }; | 
 |  | 
 | template <typename T> | 
 | std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) { | 
 |   return os << "TestParam { width:" << test_arg.Block().Width() | 
 |             << " height:" << test_arg.Block().Height() | 
 |             << " bd:" << test_arg.BitDepth() << " }"; | 
 | } | 
 |  | 
 | // Generate the list of all block widths / heights that need to be tested, | 
 | // includes chroma and luma sizes, for the given bit-depths. The test | 
 | // function is the same for all generated parameters. | 
 | template <typename T> | 
 | std::vector<TestParam<T>> GetTestParams(std::initializer_list<int> bit_depths, | 
 |                                         T test_func) { | 
 |   std::set<BlockSize> sizes; | 
 |   for (int b = BLOCK_4X4; b < BLOCK_SIZES_ALL; ++b) { | 
 |     const int w = block_size_wide[b]; | 
 |     const int h = block_size_high[b]; | 
 |     sizes.insert(BlockSize(w, h)); | 
 |     // Add in smaller chroma sizes as well. | 
 |     if (w == 4 || h == 4) { | 
 |       sizes.insert(BlockSize(w / 2, h / 2)); | 
 |     } | 
 |   } | 
 |   sizes.insert(BlockSize(24, 24)); | 
 |   std::vector<TestParam<T>> result; | 
 |   for (const BlockSize &block : sizes) { | 
 |     for (int bd : bit_depths) { | 
 |       result.push_back(TestParam<T>(block, bd, test_func)); | 
 |     } | 
 |   } | 
 |   return result; | 
 | } | 
 |  | 
// Fixture for the tests that check the test-parameter generators themselves
// work as expected. It adds no state or behavior on top of ::testing::Test;
// it only gives those tests a common suite name.
class AV1ConvolveParametersTest : public ::testing::Test {};
 |  | 
 | template <typename T> | 
 | std::vector<TestParam<T>> GetHighbdTestParams(T test_func) { | 
 |   return GetTestParams({ 10, 12 }, test_func); | 
 | } | 
 |  | 
 | template <typename T> | 
 | ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdParams( | 
 |     T test_func) { | 
 |   return ::testing::ValuesIn(GetHighbdTestParams(test_func)); | 
 | } | 
 |  | 
 | TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) { | 
 |   auto v = GetHighbdTestParams(av1_highbd_convolve_x_sr_c); | 
 | #if CONFIG_EXT_RECUR_PARTITIONS | 
 |   ASSERT_EQ(82U, v.size()); | 
 | #else | 
 |   ASSERT_EQ(60U, v.size()); | 
 | #endif  // CONFIG_EXT_RECUR_PARTITIONS | 
 |   int num_10 = 0; | 
 |   int num_12 = 0; | 
 |   for (const auto &p : v) { | 
 |     ASSERT_TRUE(p.BitDepth() == 10 || p.BitDepth() == 12); | 
 |     bool same_fn = av1_highbd_convolve_x_sr_c == p.TestFunction(); | 
 |     ASSERT_TRUE(same_fn); | 
 |     if (p.BitDepth() == 10) { | 
 |       ++num_10; | 
 |     } else { | 
 |       ++num_12; | 
 |     } | 
 |   } | 
 |   ASSERT_EQ(num_10, num_12); | 
 | } | 
 |  | 
 | // AV1ConvolveTest is the base class that all convolve tests should derive from. | 
 | // It provides storage/methods for generating randomized buffers for both | 
 | // low bit-depth and high bit-depth, and setup/teardown methods for clearing | 
 | // system state. Implementors can get the bit-depth / block-size / | 
 | // test function by calling GetParam(). | 
 | template <typename T> | 
 | class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> { | 
 |  public: | 
 |   virtual ~AV1ConvolveTest() { TearDown(); } | 
 |  | 
 |   virtual void SetUp() override { | 
 |     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); | 
 |   } | 
 |  | 
 |   virtual void TearDown() override { libaom_test::ClearSystemState(); } | 
 |  | 
 |   // Randomizes the 8-bit input buffer and returns a pointer to it. Note that | 
 |   // the pointer is safe to use with an 8-tap filter. The stride can range | 
 |   // from width to (width + kPadding). Also note that the pointer is to the | 
 |   // same memory location. | 
 |   static constexpr int kInputPadding = 8; | 
 |  | 
 |   // Get a pointer to a buffer with stride == width. Note that we must have | 
 |   // the test param passed in explicitly -- the gtest framework does not | 
 |   // support calling GetParam() within a templatized class. | 
 |   // Note that FirstRandomInput8 always returns the same pointer -- if two | 
 |   // inputs are needed, also use SecondRandomInput8. | 
 |   const uint8_t *FirstRandomInput8(const TestParam<T> ¶m) { | 
 |     // Note we can't call GetParam() directly -- gtest does not support | 
 |     // this for parameterized types. | 
 |     return RandomInput8(input8_1_, param); | 
 |   } | 
 |  | 
 |   const uint8_t *SecondRandomInput8(const TestParam<T> ¶m) { | 
 |     return RandomInput8(input8_2_, param); | 
 |   } | 
 |  | 
 |   // Some of the intrinsics perform writes in 32 byte chunks. Moreover, some | 
 |   // of the instrinsics assume that the stride is also a multiple of 32. | 
 |   // To satisfy these constraints and also remain simple, output buffer strides | 
 |   // are assumed MAX_SB_SIZE. | 
 |   static constexpr int kOutputStride = MAX_SB_SIZE; | 
 |  | 
 |   // Check that two 8-bit output buffers are identical. | 
 |   void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, | 
 |                             int height) { | 
 |     ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; | 
 |     for (int j = 0; j < height; ++j) { | 
 |       if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { | 
 |         p1 += kOutputStride; | 
 |         p2 += kOutputStride; | 
 |         continue; | 
 |       } | 
 |       for (int i = 0; i < width; ++i) { | 
 |         ASSERT_EQ(p1[i], p2[i]) | 
 |             << width << "x" << height << " Pixel mismatch at (" << i << ", " | 
 |             << j << ")"; | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   // Check that two 16-bit output buffers are identical. | 
 |   void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width, | 
 |                             int height) { | 
 |     ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations"; | 
 |     for (int j = 0; j < height; ++j) { | 
 |       if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { | 
 |         p1 += kOutputStride; | 
 |         p2 += kOutputStride; | 
 |         continue; | 
 |       } | 
 |       for (int i = 0; i < width; ++i) { | 
 |         ASSERT_EQ(p1[i], p2[i]) | 
 |             << width << "x" << height << " Pixel mismatch at (" << i << ", " | 
 |             << j << ")"; | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   // Note that the randomized values are capped by bit-depth. | 
 |   const uint16_t *FirstRandomInput12(const TestParam<T> ¶m) { | 
 |     return RandomInput12(input16_1_, param); | 
 |   } | 
 |  | 
 |   const uint16_t *SecondRandomInput12(const TestParam<T> ¶m) { | 
 |     return RandomInput12(input16_2_, param); | 
 |   } | 
 |  | 
 |   const uint16_t *FirstRandomInput16Extreme(const TestParam<T> ¶m) { | 
 |     return RandomInput16Extreme(input16_1_, param); | 
 |   } | 
 |  | 
 |  private: | 
 |   const uint8_t *RandomInput8(uint8_t *p, const TestParam<T> ¶m) { | 
 |     EXPECT_EQ(8, param.BitDepth()); | 
 |     EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
 |     EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
 |     const int padded_width = param.Block().Width() + kInputPadding; | 
 |     const int padded_height = param.Block().Height() + kInputPadding; | 
 |     Randomize(p, padded_width * padded_height); | 
 |     return p + (kInputPadding / 2) * padded_width + kInputPadding / 2; | 
 |   } | 
 |  | 
 |   void Randomize(uint8_t *p, int size) { | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = rnd_.Rand8(); | 
 |     } | 
 |   } | 
 |  | 
 |   const uint16_t *RandomInput12(uint16_t *p, const TestParam<T> ¶m) { | 
 |     // Check that this is only called with high bit-depths up to 12. | 
 |     EXPECT_TRUE(param.BitDepth() == 10 || param.BitDepth() == 12); | 
 |     EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
 |     EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
 |     const int padded_width = param.Block().Width() + kInputPadding; | 
 |     const int padded_height = param.Block().Height() + kInputPadding; | 
 |     Randomize12(p, padded_width * padded_height, param.BitDepth()); | 
 |     return p + (kInputPadding / 2) * padded_width + kInputPadding / 2; | 
 |   } | 
 |  | 
 |   void Randomize12(uint16_t *p, int size, int bit_depth) { | 
 |     EXPECT_TRUE(bit_depth == 10 || bit_depth == 12); | 
 |     // Make sure bitdepth is capped in case error not triggered | 
 |     const int bd_capped = bit_depth >= 12 ? 12 : bit_depth; | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = (uint16_t)Clamp(rnd_.Rand12(), 0, (1 << bd_capped) - 1); | 
 |     } | 
 |   } | 
 |  | 
 |   int Clamp(int value, int low, int high) { | 
 |     return value < low ? low : (value > high ? high : value); | 
 |   } | 
 |  | 
 |   const uint16_t *RandomInput16Extreme(uint16_t *p, const TestParam<T> ¶m) { | 
 |     // Check that this is only called with high bit-depths. | 
 |     EXPECT_TRUE(param.BitDepth() == 10 || param.BitDepth() == 12); | 
 |     EXPECT_GE(MAX_SB_SIZE, param.Block().Width()); | 
 |     EXPECT_GE(MAX_SB_SIZE, param.Block().Height()); | 
 |     const int padded_width = param.Block().Width() + kInputPadding; | 
 |     const int padded_height = param.Block().Height() + kInputPadding; | 
 |     RandomizeExtreme(p, padded_width * padded_height, param.BitDepth()); | 
 |     return p + (kInputPadding / 2) * padded_width + kInputPadding / 2; | 
 |   } | 
 |  | 
 |   void RandomizeExtreme(uint16_t *p, int size, int max_bit_range) { | 
 |     EXPECT_GE(12, max_bit_range); | 
 |     const int max_val = (1 << max_bit_range) - 1; | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = static_cast<uint16_t>(RandBool() ? max_val : 0); | 
 |     } | 
 |   } | 
 |  | 
 |   int RandBool() { | 
 |     const uint32_t value = rnd_.Rand8(); | 
 |     // There's a bit more entropy in the upper bits of this implementation. | 
 |     return (value >> 7) & 0x1; | 
 |   } | 
 |  | 
 |   static constexpr int kInputStride = MAX_SB_SIZE + kInputPadding; | 
 |  | 
 |   libaom_test::ACMRandom rnd_; | 
 |   // Statically allocate all the memory that is needed for the tests. Note | 
 |   // that we cannot allocate output memory here. It must use DECLARE_ALIGNED, | 
 |   // which is a C99 feature and interacts badly with C++ member variables. | 
 |   uint8_t input8_1_[kInputStride * kInputStride]; | 
 |   uint8_t input8_2_[kInputStride * kInputStride]; | 
 |   uint16_t input16_1_[kInputStride * kInputStride]; | 
 |   uint16_t input16_2_[kInputStride * kInputStride]; | 
 | }; | 
 |  | 
 | ///////////////////////////////////////////////////////// | 
 | // Single reference convolve-x functions (high bit-depth) | 
 | ///////////////////////////////////////////////////////// | 
 | typedef void (*highbd_convolve_x_func)( | 
 |     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, | 
 |     int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, | 
 |     ConvolveParams *conv_params, int bd); | 
 |  | 
 | class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> { | 
 |  public: | 
 |   void RunTest() { | 
 |     for (int sub_x = 0; sub_x < 16; ++sub_x) { | 
 |       for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL; | 
 |            ++filter) { | 
 |         InterpFilter f = static_cast<InterpFilter>(filter); | 
 |         TestConvolve(sub_x, f); | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   void TestConvolve(const int sub_x, const InterpFilter filter) { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     const int bit_depth = GetParam().BitDepth(); | 
 |     const InterpFilterParams *filter_params_x = | 
 |         av1_get_interp_filter_params_with_block_size(filter, width); | 
 |     ConvolveParams conv_params1 = | 
 |         get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth); | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |     av1_highbd_convolve_x_sr_c(input, width, reference, kOutputStride, width, | 
 |                                height, filter_params_x, sub_x, &conv_params1, | 
 |                                bit_depth); | 
 |  | 
 |     ConvolveParams conv_params2 = | 
 |         get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth); | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |     GetParam().TestFunction()(input, width, test, kOutputStride, width, height, | 
 |                               filter_params_x, sub_x, &conv_params2, bit_depth); | 
 |     AssertOutputBufferEq(reference, test, width, height); | 
 |   } | 
 | }; | 
 |  | 
// Parameterized test body: forwards to the fixture's RunTest().
TEST_P(AV1ConvolveXHighbdTest, RunTest) { RunTest(); }

// The C instantiation compares the C function against itself (the fixture
// always uses av1_highbd_convolve_x_sr_c as the reference).
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_x_sr_c));

// SIMD variants are only instantiated when the build enables the
// corresponding instruction set. The instantiation prefix names the
// architecture so the test infrastructure can filter on it.
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveXHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_x_sr_ssse3));
#endif

#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
#endif
 |  | 
 | ///////////////////////////////////////////////////////// | 
 | // Single reference convolve-y functions (high bit-depth) | 
 | ///////////////////////////////////////////////////////// | 
 | typedef void (*highbd_convolve_y_func)( | 
 |     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, | 
 |     int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, | 
 |     int bd); | 
 |  | 
 | class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> { | 
 |  public: | 
 |   void RunTest() { | 
 |     for (int sub_y = 0; sub_y < 16; ++sub_y) { | 
 |       for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL; | 
 |            ++filter) { | 
 |         InterpFilter f = static_cast<InterpFilter>(filter); | 
 |         TestConvolve(sub_y, f); | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   void TestConvolve(const int sub_y, const InterpFilter filter) { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     const int bit_depth = GetParam().BitDepth(); | 
 |     const InterpFilterParams *filter_params_y = | 
 |         av1_get_interp_filter_params_with_block_size(filter, height); | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |     av1_highbd_convolve_y_sr_c(input, width, reference, kOutputStride, width, | 
 |                                height, filter_params_y, sub_y, bit_depth); | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |     GetParam().TestFunction()(input, width, test, kOutputStride, width, height, | 
 |                               filter_params_y, sub_y, bit_depth); | 
 |     AssertOutputBufferEq(reference, test, width, height); | 
 |   } | 
 | }; | 
 |  | 
// Parameterized test body: forwards to the fixture's RunTest().
TEST_P(AV1ConvolveYHighbdTest, RunTest) { RunTest(); }

// The C instantiation compares the C function against itself (the fixture
// always uses av1_highbd_convolve_y_sr_c as the reference).
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_y_sr_c));

// SIMD variants are only instantiated when the build enables the
// corresponding instruction set.
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveYHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_y_sr_ssse3));
#endif

#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
#endif
 |  | 
 | /////////////////////////////////////////////////////////////// | 
 | // Single reference convolve-copy functions (high bit-depth) | 
 | /////////////////////////////////////////////////////////////// | 
 | typedef void (*highbd_convolve_copy_func)(const uint16_t *src, | 
 |                                           ptrdiff_t src_stride, uint16_t *dst, | 
 |                                           ptrdiff_t dst_stride, int w, int h); | 
 |  | 
 | class AV1ConvolveCopyHighbdTest | 
 |     : public AV1ConvolveTest<highbd_convolve_copy_func> { | 
 |  public: | 
 |   void RunTest() { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int width = block.Width(); | 
 |     const int height = block.Height(); | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |     aom_highbd_convolve_copy_c(input, width, reference, kOutputStride, width, | 
 |                                height); | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |     GetParam().TestFunction()(input, width, test, kOutputStride, width, height); | 
 |     AssertOutputBufferEq(reference, test, width, height); | 
 |   } | 
 | }; | 
 |  | 
// Parameterized test body: forwards to the fixture's RunTest().
TEST_P(AV1ConvolveCopyHighbdTest, RunTest) { RunTest(); }

// The C instantiation compares the C function against itself (the fixture
// always uses aom_highbd_convolve_copy_c as the reference).
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyHighbdTest,
                         BuildHighbdParams(aom_highbd_convolve_copy_c));

// SIMD variants are only instantiated when the build enables the
// corresponding instruction set.
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyHighbdTest,
                         BuildHighbdParams(aom_highbd_convolve_copy_sse2));
#endif

#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
                         BuildHighbdParams(aom_highbd_convolve_copy_avx2));
#endif
 |  | 
 | ////////////////////////////////////////////////////////// | 
 | // Single reference convolve-2d functions (high bit-depth) | 
 | ////////////////////////////////////////////////////////// | 
 |  | 
 | typedef void (*highbd_convolve_2d_func)( | 
 |     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, | 
 |     int h, const InterpFilterParams *filter_params_x, | 
 |     const InterpFilterParams *filter_params_y, const int subpel_x_qn, | 
 |     const int subpel_y_qn, ConvolveParams *conv_params, int bd); | 
 |  | 
 | class AV1Convolve2DHighbdTest | 
 |     : public AV1ConvolveTest<highbd_convolve_2d_func> { | 
 |  public: | 
 |   void RunTest() { | 
 |     for (int sub_x = 0; sub_x < 16; ++sub_x) { | 
 |       for (int sub_y = 0; sub_y < 16; ++sub_y) { | 
 |         for (int h_f = EIGHTTAP_REGULAR; h_f <= BILINEAR; ++h_f) { | 
 |           for (int v_f = EIGHTTAP_REGULAR; v_f <= BILINEAR; ++v_f) { | 
 |             TestConvolve(static_cast<InterpFilter>(h_f), | 
 |                          static_cast<InterpFilter>(v_f), sub_x, sub_y); | 
 |           } | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   void SpeedTest() { | 
 |     for (int h_f = EIGHTTAP_REGULAR; h_f <= BILINEAR; ++h_f) { | 
 |       for (int v_f = EIGHTTAP_REGULAR; v_f <= BILINEAR; ++v_f) { | 
 |         TestConvolveSpeed(static_cast<InterpFilter>(h_f), | 
 |                           static_cast<InterpFilter>(v_f), 50000, 8, 8); | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   void TestConvolve(const InterpFilter h_f, const InterpFilter v_f, | 
 |                     const int sub_x, const int sub_y) { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     const int bit_depth = GetParam().BitDepth(); | 
 |     const InterpFilterParams *filter_params_x = | 
 |         av1_get_interp_filter_params_with_block_size(h_f, width); | 
 |     const InterpFilterParams *filter_params_y = | 
 |         av1_get_interp_filter_params_with_block_size(v_f, height); | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |     ConvolveParams conv_params1 = | 
 |         get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth); | 
 |     av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width, | 
 |                                 height, filter_params_x, filter_params_y, sub_x, | 
 |                                 sub_y, &conv_params1, bit_depth); | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |     ConvolveParams conv_params2 = | 
 |         get_conv_params_no_round(0, 0, NULL, 0, 0, bit_depth); | 
 |     GetParam().TestFunction()(input, width, test, kOutputStride, width, height, | 
 |                               filter_params_x, filter_params_y, sub_x, sub_y, | 
 |                               &conv_params2, bit_depth); | 
 |     AssertOutputBufferEq(reference, test, width, height); | 
 |   } | 
 |  | 
 |   void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f, | 
 |                          int num_iters, int sub_x, int sub_y) { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     const int bit_depth = GetParam().BitDepth(); | 
 |     const InterpFilterParams *filter_params_x = | 
 |         av1_get_interp_filter_params_with_block_size(h_f, width); | 
 |     const InterpFilterParams *filter_params_y = | 
 |         av1_get_interp_filter_params_with_block_size(v_f, height); | 
 |  | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |     ConvolveParams conv_params1 = | 
 |         get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth); | 
 |     aom_usec_timer timer; | 
 |     aom_usec_timer_start(&timer); | 
 |     for (int i = 0; i < num_iters; ++i) { | 
 |       av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width, | 
 |                                   height, filter_params_x, filter_params_y, | 
 |                                   sub_x, sub_y, &conv_params1, bit_depth); | 
 |     } | 
 |     aom_usec_timer_mark(&timer); | 
 |     const int time1 = static_cast<int>(aom_usec_timer_elapsed(&timer)); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |     ConvolveParams conv_params2 = | 
 |         get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth); | 
 |     aom_usec_timer_start(&timer); | 
 |     for (int i = 0; i < num_iters; ++i) { | 
 |       GetParam().TestFunction()(input, width, test, kOutputStride, width, | 
 |                                 height, filter_params_x, filter_params_y, sub_x, | 
 |                                 sub_y, &conv_params2, bit_depth); | 
 |     } | 
 |     aom_usec_timer_mark(&timer); | 
 |     const int time2 = static_cast<int>(aom_usec_timer_elapsed(&timer)); | 
 |  | 
 |     printf("%d - %d %3dx%-3d bd: %d ref: %d mod: %d (%3.2f)\n", h_f, v_f, width, | 
 |            height, bit_depth, time1, time2, (double)time1 / time2); | 
 |     AssertOutputBufferEq(reference, test, width, height); | 
 |   } | 
 | }; | 
 |  | 
// Parameterized test bodies: forward to the fixture's RunTest() / SpeedTest().
// The speed test carries gtest's DISABLED_ prefix, so it is skipped unless
// disabled tests are explicitly requested.
TEST_P(AV1Convolve2DHighbdTest, RunTest) { RunTest(); }
TEST_P(AV1Convolve2DHighbdTest, DISABLED_Speed) { SpeedTest(); }

// The C instantiation compares the C function against itself (the fixture
// always uses av1_highbd_convolve_2d_sr_c as the reference).
INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_2d_sr_c));

// SIMD variants are only instantiated when the build enables the
// corresponding instruction set.
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_2d_sr_ssse3));
#endif

#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DHighbdTest,
                         BuildHighbdParams(av1_highbd_convolve_2d_sr_avx2));
#endif
 |  | 
 | ////////////////////////// | 
 | // Compound Convolve Tests | 
 | ////////////////////////// | 
 |  | 
 | // The compound functions do not work for chroma block sizes. Provide | 
 | // a function to generate test parameters for just luma block sizes. | 
 | template <typename T> | 
 | std::vector<TestParam<T>> GetLumaTestParams( | 
 |     std::initializer_list<int> bit_depths, T test_func) { | 
 |   std::set<BlockSize> sizes; | 
 |   for (int b = BLOCK_4X4; b < BLOCK_SIZES_ALL; ++b) { | 
 |     const int w = block_size_wide[b]; | 
 |     const int h = block_size_high[b]; | 
 |     sizes.insert(BlockSize(w, h)); | 
 |   } | 
 |   std::vector<TestParam<T>> result; | 
 |   for (int bit_depth : bit_depths) { | 
 |     for (const auto &block : sizes) { | 
 |       result.push_back(TestParam<T>(block, bit_depth, test_func)); | 
 |     } | 
 |   } | 
 |   return result; | 
 | } | 
 |  | 
 | template <typename T> | 
 | std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) { | 
 |   return GetLumaTestParams({ 10, 12 }, test_func); | 
 | } | 
 |  | 
 | TEST_F(AV1ConvolveParametersTest, GetHighbdLumaTestParams) { | 
 |   auto v = GetHighbdLumaTestParams(av1_highbd_dist_wtd_convolve_x_c); | 
 |   ASSERT_EQ(static_cast<size_t>(BLOCK_SIZES_ALL * 2), v.size()); | 
 |   int num_10 = 0; | 
 |   int num_12 = 0; | 
 |   for (const auto &e : v) { | 
 |     ASSERT_TRUE(10 == e.BitDepth() || 12 == e.BitDepth()); | 
 |     bool same_fn = av1_highbd_dist_wtd_convolve_x_c == e.TestFunction(); | 
 |     ASSERT_TRUE(same_fn); | 
 |     if (e.BitDepth() == 10) { | 
 |       ++num_10; | 
 |     } else { | 
 |       ++num_12; | 
 |     } | 
 |   } | 
 |   ASSERT_EQ(num_10, num_12); | 
 | } | 
 |  | 
 | template <typename T> | 
 | ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdLumaParams( | 
 |     T test_func) { | 
 |   return ::testing::ValuesIn(GetHighbdLumaTestParams(test_func)); | 
 | } | 
 |  | 
 | // Compound cases also need to test different frame offsets and weightings. | 
 | class CompoundParam { | 
 |  public: | 
 |   CompoundParam(int fwd_offset, int bck_offset) | 
 |       : fwd_offset_(fwd_offset), bck_offset_(bck_offset) {} | 
 |  | 
 |   bool UseWtdCompAvg() const { | 
 |     return bck_offset_ != (1 << (DIST_PRECISION_BITS - 1)) || | 
 |            fwd_offset_ != (1 << (DIST_PRECISION_BITS - 1)); | 
 |   } | 
 |   int FwdOffset() const { return fwd_offset_; } | 
 |   int BckOffset() const { return bck_offset_; } | 
 |  | 
 |  private: | 
 |   int fwd_offset_; | 
 |   int bck_offset_; | 
 | }; | 
 |  | 
 | std::vector<CompoundParam> GetCompoundParams() { | 
 |   std::vector<CompoundParam> result; | 
 |   result.push_back(CompoundParam(1 << (DIST_PRECISION_BITS - 1), | 
 |                                  1 << (DIST_PRECISION_BITS - 1))); | 
 |   for (int k = 0; k < 2; ++k) { | 
 |     for (int l = 0; l < 4; ++l) { | 
 |       result.push_back(CompoundParam(quant_dist_lookup_table[l][k], | 
 |                                      quant_dist_lookup_table[l][1 - k])); | 
 |     } | 
 |   } | 
 |   return result; | 
 | } | 
 |  | 
 | TEST_F(AV1ConvolveParametersTest, GetCompoundParams) { | 
 |   auto v = GetCompoundParams(); | 
 |   ASSERT_EQ(9U, v.size()); | 
 |   ASSERT_FALSE(v[0].UseWtdCompAvg()); | 
 |   for (size_t i = 1; i < v.size(); ++i) { | 
 |     ASSERT_TRUE(v[i].UseWtdCompAvg()); | 
 |   } | 
 | } | 
 |  | 
 | ///////////////////////////////////////////////// | 
 | // Compound convolve-x functions (high bit-depth) | 
 | ///////////////////////////////////////////////// | 
 | ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf, | 
 |                                  int width, int bit_depth, | 
 |                                  const CompoundParam &compound) { | 
 |   ConvolveParams conv_params = | 
 |       get_conv_params_no_round(do_average, 0, conv_buf, width, 1, bit_depth); | 
 |   (void)compound; | 
 |   conv_params.fwd_offset = compound.FwdOffset(); | 
 |   conv_params.bck_offset = compound.BckOffset(); | 
 |   return conv_params; | 
 | } | 
 |  | 
 | // Fixture for the high bit-depth compound convolve-x functions. The function | 
 | // under test and the C reference are each run on two inputs (without and then | 
 | // with averaging) and their outputs are compared. | 
 | class AV1ConvolveXHighbdCompoundTest | 
 |     : public AV1ConvolveTest<highbd_convolve_x_func> { | 
 |  public: | 
 |   // Covers all 16 sub-pixel positions, every filter type below | 
 |   // INTERP_FILTERS_ALL and every entry of GetCompoundParams(). | 
 |   void RunTest() { | 
 |     auto all_compound = GetCompoundParams(); | 
 |     for (int position = 0; position < 16; ++position) { | 
 |       for (int type = EIGHTTAP_REGULAR; type < INTERP_FILTERS_ALL; ++type) { | 
 |         const InterpFilter filter = static_cast<InterpFilter>(type); | 
 |         for (const auto &compound : all_compound) { | 
 |           TestConvolve(position, filter, compound); | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |  protected: | 
 |   // Returns the filter parameters for `f`, looked up by the block width. | 
 |   // Virtual so that a derived fixture can look them up differently. | 
 |   virtual const InterpFilterParams *FilterParams(InterpFilter f, | 
 |                                                  const BlockSize &block) const { | 
 |     const int filter_dim = block.Width(); | 
 |     return av1_get_interp_filter_params_with_block_size(f, filter_dim); | 
 |   } | 
 |  | 
 |   // Returns the C implementation that the function under test is compared | 
 |   // against. | 
 |   virtual highbd_convolve_x_func ReferenceFunc() const { | 
 |     return av1_highbd_dist_wtd_convolve_x_c; | 
 |   } | 
 |  | 
 |  private: | 
 |   // Runs the reference and the function under test with the same arguments, | 
 |   // then compares first the intermediate buffers and then the final outputs. | 
 |   void TestConvolve(const int sub_pix, const InterpFilter filter, | 
 |                     const CompoundParam &compound) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |  | 
 |     const uint16_t *first_src = FirstRandomInput12(GetParam()); | 
 |     const uint16_t *second_src = SecondRandomInput12(GetParam()); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, expected[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, CONV_BUF_TYPE, expected_conv_buf[MAX_SB_SQUARE]); | 
 |     Convolve(ReferenceFunc(), first_src, second_src, expected, | 
 |              expected_conv_buf, compound, sub_pix, filter); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, actual[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, CONV_BUF_TYPE, actual_conv_buf[MAX_SB_SQUARE]); | 
 |     Convolve(GetParam().TestFunction(), first_src, second_src, actual, | 
 |              actual_conv_buf, compound, sub_pix, filter); | 
 |  | 
 |     AssertOutputBufferEq(expected_conv_buf, actual_conv_buf, block_w, block_h); | 
 |     AssertOutputBufferEq(expected, actual, block_w, block_h); | 
 |   } | 
 |  | 
 |   // Calls `test_func` twice with the same destination and compound buffer: | 
 |   // on `src1` with do_average = 0, then on `src2` with do_average = 1. | 
 |   void Convolve(highbd_convolve_x_func test_func, const uint16_t *src1, | 
 |                 const uint16_t *src2, uint16_t *dst, CONV_BUF_TYPE *conv_buf, | 
 |                 const CompoundParam &compound, const int sub_pix, | 
 |                 const InterpFilter filter) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |     const int bd = GetParam().BitDepth(); | 
 |     const InterpFilterParams *taps = FilterParams(filter, block); | 
 |  | 
 |     // The input rows are block_w samples apart; the output uses kOutputStride. | 
 |     ConvolveParams first_pass = | 
 |         GetConvolveParams(0, conv_buf, kOutputStride, bd, compound); | 
 |     test_func(src1, block_w, dst, kOutputStride, block_w, block_h, taps, | 
 |               sub_pix, &first_pass, bd); | 
 |  | 
 |     ConvolveParams second_pass = | 
 |         GetConvolveParams(1, conv_buf, kOutputStride, bd, compound); | 
 |     test_func(src2, block_w, dst, kOutputStride, block_w, block_h, taps, | 
 |               sub_pix, &second_pass, bd); | 
 |   } | 
 | }; | 
 |  | 
 | TEST_P(AV1ConvolveXHighbdCompoundTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 |  | 
 | // One instantiation per available implementation; the suite name carries the | 
 | // architecture. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     C, AV1ConvolveXHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_c)); | 
 |  | 
 | #if HAVE_SSE4_1 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     SSE4_1, AV1ConvolveXHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sse4_1)); | 
 | #endif | 
 |  | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1ConvolveXHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2)); | 
 | #endif | 
 |  | 
 | ///////////////////////////////////////////////// | 
 | // Compound convolve-y functions (high bit-depth) | 
 | ///////////////////////////////////////////////// | 
 |  | 
 | // Again, the X and Y convolve functions have the same type signature and logic. | 
 | class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest { | 
 |  private: | 
 |   // Compare against the C convolve-y implementation instead of convolve-x. | 
 |   highbd_convolve_x_func ReferenceFunc() const override { | 
 |     return av1_highbd_dist_wtd_convolve_y_c; | 
 |   } | 
 |  | 
 |   // Look the filter parameters up by block height rather than block width. | 
 |   const InterpFilterParams *FilterParams( | 
 |       InterpFilter f, const BlockSize &block) const override { | 
 |     const int filter_dim = block.Height(); | 
 |     return av1_get_interp_filter_params_with_block_size(f, filter_dim); | 
 |   } | 
 | }; | 
 |  | 
 | TEST_P(AV1ConvolveYHighbdCompoundTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 |  | 
 | // One instantiation per available implementation. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     C, AV1ConvolveYHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_c)); | 
 |  | 
 | #if HAVE_SSE4_1 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     SSE4_1, AV1ConvolveYHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sse4_1)); | 
 | #endif | 
 |  | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1ConvolveYHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2)); | 
 | #endif | 
 |  | 
 | /////////////////////////////////////////////////////// | 
 | // Compound convolve-2d-copy functions (high bit-depth) | 
 | /////////////////////////////////////////////////////// | 
 | // Pointer type for the high bit-depth compound 2D-copy convolve functions | 
 | // exercised by the fixture below. | 
 | using highbd_compound_conv_2d_copy_func = void (*)(const uint16_t *src, | 
 |                                                    int src_stride, | 
 |                                                    uint16_t *dst, | 
 |                                                    int dst_stride, int w, | 
 |                                                    int h, | 
 |                                                    ConvolveParams *conv_params, | 
 |                                                    int bd); | 
 |  | 
 | // Fixture for the high bit-depth compound 2D-copy convolve functions. | 
 | class AV1Convolve2DCopyHighbdCompoundTest | 
 |     : public AV1ConvolveTest<highbd_compound_conv_2d_copy_func> { | 
 |  public: | 
 |   // Compares the function under test with the C reference for every entry of | 
 |   // GetCompoundParams(). | 
 |   void RunTest() { | 
 |     for (const auto &compound : GetCompoundParams()) { | 
 |       TestConvolve(compound); | 
 |     } | 
 |   } | 
 |  | 
 |   // Prints the C-to-tested time ratio for every entry of GetCompoundParams(). | 
 |   void SpeedTest() { | 
 |     for (const auto &compound : GetCompoundParams()) { | 
 |       SpeedTestConvolve(compound); | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   // Times 100000 rounds of (no-average call, average call) for the C function | 
 |   // and then for the function under test, and prints elapsed_c / elapsed_test. | 
 |   void SpeedTestConvolve(const CompoundParam &compound) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |     const int bd = GetParam().BitDepth(); | 
 |     const int num_runs = 100000; | 
 |  | 
 |     const uint16_t *src = FirstRandomInput12(GetParam()); | 
 |     // The same array serves as the destination and as the compound buffer. | 
 |     DECLARE_ALIGNED(32, uint16_t, conv_buf[MAX_SB_SQUARE]); | 
 |     highbd_compound_conv_2d_copy_func test_func = GetParam().TestFunction(); | 
 |  | 
 |     ConvolveParams no_avg_params = | 
 |         GetConvolveParams(0, conv_buf, kOutputStride, bd, compound); | 
 |     ConvolveParams avg_params = | 
 |         GetConvolveParams(1, conv_buf, kOutputStride, bd, compound); | 
 |  | 
 |     aom_usec_timer ref_timer; | 
 |     aom_usec_timer_start(&ref_timer); | 
 |     for (int run = 0; run < num_runs; run++) { | 
 |       av1_highbd_dist_wtd_convolve_2d_copy_c(src, block_w, conv_buf, | 
 |                                              kOutputStride, block_w, block_h, | 
 |                                              &no_avg_params, bd); | 
 |       av1_highbd_dist_wtd_convolve_2d_copy_c(src, block_w, conv_buf, | 
 |                                              kOutputStride, block_w, block_h, | 
 |                                              &avg_params, bd); | 
 |     } | 
 |     aom_usec_timer_mark(&ref_timer); | 
 |     const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); | 
 |  | 
 |     aom_usec_timer test_timer; | 
 |     aom_usec_timer_start(&test_timer); | 
 |     for (int run = 0; run < num_runs; run++) { | 
 |       test_func(src, block_w, conv_buf, kOutputStride, block_w, block_h, | 
 |                 &no_avg_params, bd); | 
 |       test_func(src, block_w, conv_buf, kOutputStride, block_w, block_h, | 
 |                 &avg_params, bd); | 
 |     } | 
 |     aom_usec_timer_mark(&test_timer); | 
 |     const int test_usec = | 
 |         static_cast<int>(aom_usec_timer_elapsed(&test_timer)); | 
 |  | 
 |     printf("%d x %d block: bd: %d, Scaling = %.2f\n", block_w, block_h, bd, | 
 |            static_cast<double>(ref_usec) / test_usec); | 
 |   } | 
 |  | 
 |   // Runs the C reference and the function under test on the same two inputs, | 
 |   // then compares the intermediate buffers and the final outputs. | 
 |   void TestConvolve(const CompoundParam &compound) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |  | 
 |     const uint16_t *first_src = FirstRandomInput12(GetParam()); | 
 |     const uint16_t *second_src = SecondRandomInput12(GetParam()); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, expected[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, CONV_BUF_TYPE, expected_conv_buf[MAX_SB_SQUARE]); | 
 |     Convolve(av1_highbd_dist_wtd_convolve_2d_copy_c, first_src, second_src, | 
 |              expected, expected_conv_buf, compound); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, actual[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, CONV_BUF_TYPE, actual_conv_buf[MAX_SB_SQUARE]); | 
 |     Convolve(GetParam().TestFunction(), first_src, second_src, actual, | 
 |              actual_conv_buf, compound); | 
 |  | 
 |     AssertOutputBufferEq(expected_conv_buf, actual_conv_buf, block_w, block_h); | 
 |     AssertOutputBufferEq(expected, actual, block_w, block_h); | 
 |   } | 
 |  | 
 |   // Calls `test_func` on `src1` with do_average = 0 and then on `src2` with | 
 |   // do_average = 1, using the same destination and compound buffer. | 
 |   void Convolve(highbd_compound_conv_2d_copy_func test_func, | 
 |                 const uint16_t *src1, const uint16_t *src2, uint16_t *dst, | 
 |                 uint16_t *conv_buf, const CompoundParam &compound) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |     const int bd = GetParam().BitDepth(); | 
 |  | 
 |     ConvolveParams first_pass = | 
 |         GetConvolveParams(0, conv_buf, kOutputStride, bd, compound); | 
 |     test_func(src1, block_w, dst, kOutputStride, block_w, block_h, | 
 |               &first_pass, bd); | 
 |  | 
 |     ConvolveParams second_pass = | 
 |         GetConvolveParams(1, conv_buf, kOutputStride, bd, compound); | 
 |     test_func(src2, block_w, dst, kOutputStride, block_w, block_h, | 
 |               &second_pass, bd); | 
 |   } | 
 | }; | 
 |  | 
 | TEST_P(AV1Convolve2DCopyHighbdCompoundTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 |  | 
 | // Registered with the DISABLED_ prefix, so it only runs on explicit request. | 
 | TEST_P(AV1Convolve2DCopyHighbdCompoundTest, DISABLED_SpeedTest) { | 
 |   SpeedTest(); | 
 | } | 
 |  | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     C, AV1Convolve2DCopyHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_c)); | 
 |  | 
 | #if HAVE_SSE4_1 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     SSE4_1, AV1Convolve2DCopyHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_sse4_1)); | 
 | #endif | 
 |  | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1Convolve2DCopyHighbdCompoundTest, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2)); | 
 | #endif | 
 |  | 
 | ////////////////////////////////////////////////// | 
 | // Compound convolve-2d functions (high bit-depth) | 
 | ////////////////////////////////////////////////// | 
 |  | 
 | // Fixture for the high bit-depth compound 2D convolve functions. | 
 | class AV1Convolve2DHighbdCompoundTestLarge | 
 |     : public AV1ConvolveTest<highbd_convolve_2d_func> { | 
 |  public: | 
 |   // Covers every (horizontal filter, vertical filter) pair below | 
 |   // INTERP_FILTERS_ALL, all 16x16 sub-pixel positions and every entry of | 
 |   // GetCompoundParams(). | 
 |   void RunTest() { | 
 |     auto all_compound = GetCompoundParams(); | 
 |     for (int h = EIGHTTAP_REGULAR; h < INTERP_FILTERS_ALL; ++h) { | 
 |       const InterpFilter horiz = static_cast<InterpFilter>(h); | 
 |       for (int v = EIGHTTAP_REGULAR; v < INTERP_FILTERS_ALL; ++v) { | 
 |         const InterpFilter vert = static_cast<InterpFilter>(v); | 
 |         for (int pos_x = 0; pos_x < 16; ++pos_x) { | 
 |           for (int pos_y = 0; pos_y < 16; ++pos_y) { | 
 |             for (const auto &compound : all_compound) { | 
 |               TestConvolve(horiz, vert, pos_x, pos_y, compound); | 
 |             } | 
 |           } | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   // Runs the C reference and the function under test on the same two inputs, | 
 |   // then compares the intermediate buffers and the final outputs. | 
 |   void TestConvolve(const InterpFilter h_f, const InterpFilter v_f, | 
 |                     const int sub_x, const int sub_y, | 
 |                     const CompoundParam &compound) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |     const uint16_t *first_src = FirstRandomInput12(GetParam()); | 
 |     const uint16_t *second_src = SecondRandomInput12(GetParam()); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, expected[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, CONV_BUF_TYPE, expected_conv_buf[MAX_SB_SQUARE]); | 
 |     Convolve(av1_highbd_dist_wtd_convolve_2d_c, first_src, second_src, | 
 |              expected, expected_conv_buf, compound, h_f, v_f, sub_x, sub_y); | 
 |  | 
 |     DECLARE_ALIGNED(32, uint16_t, actual[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, CONV_BUF_TYPE, actual_conv_buf[MAX_SB_SQUARE]); | 
 |     Convolve(GetParam().TestFunction(), first_src, second_src, actual, | 
 |              actual_conv_buf, compound, h_f, v_f, sub_x, sub_y); | 
 |  | 
 |     AssertOutputBufferEq(expected_conv_buf, actual_conv_buf, block_w, block_h); | 
 |     AssertOutputBufferEq(expected, actual, block_w, block_h); | 
 |   } | 
 |  | 
 |   // Calls `test_func` on `src1` with do_average = 0 and then on `src2` with | 
 |   // do_average = 1. The horizontal filter parameters are looked up by block | 
 |   // width and the vertical ones by block height. | 
 |   void Convolve(highbd_convolve_2d_func test_func, const uint16_t *src1, | 
 |                 const uint16_t *src2, uint16_t *dst, uint16_t *conv_buf, | 
 |                 const CompoundParam &compound, const InterpFilter h_f, | 
 |                 const InterpFilter v_f, const int sub_x, const int sub_y) { | 
 |     const BlockSize &block = GetParam().Block(); | 
 |     const int block_w = block.Width(); | 
 |     const int block_h = block.Height(); | 
 |  | 
 |     const InterpFilterParams *taps_x = | 
 |         av1_get_interp_filter_params_with_block_size(h_f, block_w); | 
 |     const InterpFilterParams *taps_y = | 
 |         av1_get_interp_filter_params_with_block_size(v_f, block_h); | 
 |     const int bd = GetParam().BitDepth(); | 
 |  | 
 |     ConvolveParams first_pass = | 
 |         GetConvolveParams(0, conv_buf, kOutputStride, bd, compound); | 
 |     test_func(src1, block_w, dst, kOutputStride, block_w, block_h, taps_x, | 
 |               taps_y, sub_x, sub_y, &first_pass, bd); | 
 |  | 
 |     ConvolveParams second_pass = | 
 |         GetConvolveParams(1, conv_buf, kOutputStride, bd, compound); | 
 |     test_func(src2, block_w, dst, kOutputStride, block_w, block_h, taps_x, | 
 |               taps_y, sub_x, sub_y, &second_pass, bd); | 
 |   } | 
 | }; | 
 |  | 
 | TEST_P(AV1Convolve2DHighbdCompoundTestLarge, RunTest) { | 
 |   RunTest(); | 
 | } | 
 |  | 
 | // One instantiation per available implementation. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     C, AV1Convolve2DHighbdCompoundTestLarge, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_c)); | 
 |  | 
 | #if HAVE_SSE4_1 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     SSE4_1, AV1Convolve2DHighbdCompoundTestLarge, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sse4_1)); | 
 | #endif | 
 |  | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1Convolve2DHighbdCompoundTestLarge, | 
 |     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_avx2)); | 
 | #endif | 
 |  | 
 | ////////////////////////////////////////////////////////// | 
 | // Nonseparable convolve-2d functions (high bit-depth) | 
 | ////////////////////////////////////////////////////////// | 
 |  | 
 | // Pointer type for the high bit-depth non-separable 2D convolve functions | 
 | // exercised by the fixture below. | 
 | using highbd_convolve_nonsep_2d_func = void (*)( | 
 |     const uint16_t *src, int src_stride, | 
 |     const NonsepFilterConfig *filter_config, const int16_t *filter, | 
 |     uint16_t *dst, int dst_stride, int bit_depth, int block_row_begin, | 
 |     int block_row_end, int block_col_begin, int block_col_end); | 
 |  | 
 | // Fixture for the high bit-depth non-separable 2D convolve functions. The | 
 | // restoration type passed to RunTest()/RunSpeedTest() selects which C | 
 | // reference function and which pair of filter configurations is used. | 
 | class AV1ConvolveNonSep2DHighbdTest | 
 |     : public AV1ConvolveTest<highbd_convolve_nonsep_2d_func> { | 
 |  public: | 
 |   // Runs kTestIterations comparisons, drawing new filter taps before each. | 
 |   void RunTest(RestorationType rtype) { | 
 |     for (int i = 0; i < kTestIterations; i++) { | 
 |       SetFilterTaps(); | 
 |       TestConvolve(FilterTaps_, rtype); | 
 |     } | 
 |   } | 
 |  | 
 |   // Times the C reference against the function under test. | 
 |   void RunSpeedTest(RestorationType rtype) { | 
 |     // FilterTaps_ has no initializer, so fill it first; otherwise the timed | 
 |     // functions would read indeterminate tap values. | 
 |     SetFilterTaps(); | 
 |     SpeedTestConvolve(FilterTaps_, rtype); | 
 |   } | 
 |  | 
 |  private: | 
 |   // Chooses the C reference function and the two per-plane filter | 
 |   // configurations (index 0 and 1) for `rtype`. Both `filter_config` entries | 
 |   // are left as nullptr when `rtype` is neither RESTORE_PC_WIENER nor | 
 |   // RESTORE_WIENER_NONSEP, so callers must check them before use. | 
 |   void SelectReference(RestorationType rtype, | 
 |                        highbd_convolve_nonsep_2d_func *ref_func, | 
 |                        const NonsepFilterConfig *filter_config[2]) const { | 
 |     *ref_func = av1_convolve_symmetric_highbd_c; | 
 |     filter_config[0] = nullptr; | 
 |     filter_config[1] = nullptr; | 
 |  | 
 |     if (rtype == RESTORE_PC_WIENER) { | 
 |       *ref_func = av1_convolve_symmetric_highbd_c; | 
 |       filter_config[0] = &UnconstrainedSumFilterConfig_; | 
 |       filter_config[1] = &PcWienerNonsepFilterConfigChroma_; | 
 |     } | 
 |  | 
 |     // When CONFIG_WIENER_NONSEP=1, luma and chroma planes use a different | 
 |     // number of filter taps and both need to be tested. Here, luma is tested | 
 |     // for 12/13-tap filtering whereas chroma is tested for 6-tap filtering. | 
 |     if (rtype == RESTORE_WIENER_NONSEP) { | 
 |       *ref_func = av1_convolve_symmetric_subtract_center_highbd_c; | 
 |       filter_config[0] = &UnitSumFilterConfig_; | 
 |       filter_config[1] = &UnitSumFilterConfigChroma_; | 
 |     } | 
 |   } | 
 |  | 
 |   // For each of the two filter configurations selected by `rtype`, runs the | 
 |   // C reference into `reference` and the function under test into `test`, | 
 |   // then compares the two buffers over width x height. | 
 |   void BitMatchTest(const uint16_t *input, int input_stride, int width, | 
 |                     int height, const int16_t *filter, uint16_t *reference, | 
 |                     uint16_t *test, int dst_stride, int bit_depth, | 
 |                     int block_row_begin, int block_row_end, int block_col_begin, | 
 |                     int block_col_end, RestorationType rtype) { | 
 |     const int num_planes = 2; | 
 |     const NonsepFilterConfig *filter_config[2] = { nullptr, nullptr }; | 
 |     highbd_convolve_nonsep_2d_func ref_func = nullptr; | 
 |     SelectReference(rtype, &ref_func, filter_config); | 
 |  | 
 |     // A gtest assertion (rather than assert()) keeps the check active in | 
 |     // NDEBUG builds and reports through the test framework. | 
 |     ASSERT_TRUE(filter_config[0] != nullptr && filter_config[1] != nullptr) | 
 |         << "Unsupported restoration type"; | 
 |  | 
 |     const highbd_convolve_nonsep_2d_func test_func = GetParam().TestFunction(); | 
 |     for (int plane = 0; plane < num_planes; plane++) { | 
 |       ref_func(input, input_stride, filter_config[plane], filter, reference, | 
 |                dst_stride, bit_depth, block_row_begin, block_row_end, | 
 |                block_col_begin, block_col_end); | 
 |       test_func(input, input_stride, filter_config[plane], filter, test, | 
 |                 dst_stride, bit_depth, block_row_begin, block_row_end, | 
 |                 block_col_begin, block_col_end); | 
 |       AssertOutputBufferEq(reference, test, width, height); | 
 |     } | 
 |   } | 
 |  | 
 |   // Runs BitMatchTest() twice over the whole block: once with the 12-bit | 
 |   // random input and the caller's taps, and once with the "extreme" input | 
 |   // and taps set to the extreme values of the tap range. | 
 |   void TestConvolve(const int16_t *filter, RestorationType rtype) { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     const int bit_depth = GetParam().BitDepth(); | 
 |  | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |  | 
 |     ASSERT_TRUE(kInputPadding >= kMaxTapOffset) | 
 |         << "Not enough padding for 7x7 filters"; | 
 |     const int input_stride = width; | 
 |     BitMatchTest(input, input_stride, width, height, filter, reference, test, | 
 |                  kOutputStride, bit_depth, 0, height, 0, width, rtype); | 
 |  | 
 |     // Extreme value test | 
 |     const uint16_t *extreme_input = FirstRandomInput16Extreme(GetParam()); | 
 |     int16_t extreme_taps[kMaxNumSymmetricTaps + 1]; | 
 |     RandomizeExtreamFilterTap(extreme_taps, kMaxNumSymmetricTaps + 1, | 
 |                               kMaxPrecisionBeforeOverflow); | 
 |     BitMatchTest(extreme_input, input_stride, width, height, extreme_taps, | 
 |                  reference, test, kOutputStride, bit_depth, 0, height, 0, width, | 
 |                  rtype); | 
 |   } | 
 |  | 
 |   // For each of the two filter configurations selected by `rtype`, times | 
 |   // kSpeedIterations calls of the C reference and of the function under test | 
 |   // and prints the per-pixel times and their ratio. | 
 |   void SpeedTestConvolve(const int16_t *filter, RestorationType rtype) { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     const int bit_depth = GetParam().BitDepth(); | 
 |     const int num_planes = 2; | 
 |  | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]); | 
 |  | 
 |     ASSERT_TRUE(kInputPadding >= kMaxTapOffset) | 
 |         << "Not enough padding for 7x7 filters"; | 
 |  | 
 |     const NonsepFilterConfig *filter_config[2] = { nullptr, nullptr }; | 
 |     highbd_convolve_nonsep_2d_func ref_func = nullptr; | 
 |     SelectReference(rtype, &ref_func, filter_config); | 
 |     // Without this check an unsupported `rtype` would hand a null config to | 
 |     // the timed functions. | 
 |     ASSERT_TRUE(filter_config[0] != nullptr && filter_config[1] != nullptr) | 
 |         << "Unsupported restoration type"; | 
 |  | 
 |     const highbd_convolve_nonsep_2d_func test_func = GetParam().TestFunction(); | 
 |     for (int plane = 0; plane < num_planes; plane++) { | 
 |       // Calculate time taken by reference/c function | 
 |       aom_usec_timer timer; | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int i = 0; i < kSpeedIterations; ++i) { | 
 |         ref_func(input, width, filter_config[plane], filter, reference, | 
 |                  kOutputStride, bit_depth, 0, height, 0, width); | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       auto elapsed_time_c = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Calculate time taken by optimized/intrinsic function | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int i = 0; i < kSpeedIterations; ++i) { | 
 |         test_func(input, width, filter_config[plane], filter, test, | 
 |                   kOutputStride, bit_depth, 0, height, 0, width); | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       auto elapsed_time_opt = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Elapsed microseconds times 1000, divided by the number of pixels | 
 |       // processed across all iterations. | 
 |       float c_time_per_pixel = | 
 |           1000.0f * elapsed_time_c / (kSpeedIterations * width * height); | 
 |       float opt_time_per_pixel = | 
 |           1000.0f * elapsed_time_opt / (kSpeedIterations * width * height); | 
 |       float scaling = c_time_per_pixel / opt_time_per_pixel; | 
 |       printf( | 
 |           "plane=%3d, %3dx%-3d: c_time_per_pixel=%10.5f, " | 
 |           "opt_time_per_pixel=%10.5f,  scaling=%f \n", | 
 |           plane, width, height, c_time_per_pixel, opt_time_per_pixel, scaling); | 
 |     } | 
 |   } | 
 |  | 
 |   // Fills all kMaxNumSymmetricTaps + 1 entries of FilterTaps_ with random | 
 |   // values limited to kMaxPrecisionBeforeOverflow bits. | 
 |   // NOTE(review): presumably the final entry is the unconstrained tap that | 
 |   // only the PC_WIENER configuration consumes (the unit-sum configurations | 
 |   // use one entry fewer) - confirm against NonsepFilterConfig. | 
 |   void SetFilterTaps() { | 
 |     Randomize(FilterTaps_, kMaxNumSymmetricTaps + 1, | 
 |               kMaxPrecisionBeforeOverflow); | 
 |   } | 
 |  | 
 |   // Fills the array p with random values masked to the low max_bit_range | 
 |   // bits, so every entry lies in [0, 2^max_bit_range - 1]. | 
 |   void Randomize(int16_t *p, int size, int max_bit_range) { | 
 |     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16"; | 
 |     const int low_bits_mask = (1 << max_bit_range) - 1; | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = rnd_.Rand15Signed() & low_bits_mask; | 
 |     } | 
 |   } | 
 |  | 
 |   // Fills the array p with, at random, either the largest or the smallest | 
 |   // value of a signed max_bit_range-bit integer. | 
 |   void RandomizeExtreamFilterTap(int16_t *p, int size, int max_bit_range) { | 
 |     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16"; | 
 |     const int sign_max_val = (1 << (max_bit_range - 1)) - 1; | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = | 
 |           static_cast<int16_t>(RandBool() ? sign_max_val : -(sign_max_val + 1)); | 
 |     } | 
 |   } | 
 |  | 
 |   // Returns 0 or 1, taken from bit 7 of an 8-bit random draw. | 
 |   int RandBool() { | 
 |     const uint32_t value = rnd_.Rand8(); | 
 |     // There's a bit more entropy in the upper bits of this implementation. | 
 |     return (value >> 7) & 0x1; | 
 |   } | 
 |  | 
 |   libaom_test::ACMRandom rnd_; | 
 |   static constexpr int kMaxPrecisionBeforeOverflow = 12; | 
 |   static constexpr int kMaxNumSymmetricTaps = 18; | 
 |   static constexpr int kMaxTapOffset = 3;  // Filters are 7x7. | 
 |   static constexpr int kSpeedIterations = 10000; | 
 |   static constexpr int kTestIterations = 100; | 
 |  | 
 |   // Filters use all unique taps. | 
 |   const NonsepFilterConfig UnconstrainedSumFilterConfig_ = { | 
 |     kMaxPrecisionBeforeOverflow, | 
 |     sizeof(wienerns_simd_config_y) / sizeof(wienerns_simd_config_y[0]), | 
 |     0, | 
 |     wienerns_simd_config_y, | 
 |     NULL, | 
 |     0, | 
 |     0, | 
 |     1, | 
 |     1 | 
 |   }; | 
 |  | 
 |   const NonsepFilterConfig PcWienerNonsepFilterConfigChroma_ = { | 
 |     kMaxPrecisionBeforeOverflow, | 
 |     sizeof(wienerns_simd_config_uv_from_uvonly) / | 
 |         sizeof(wienerns_simd_config_uv_from_uvonly[0]), | 
 |     0, | 
 |     wienerns_simd_config_uv_from_uvonly, | 
 |     NULL, | 
 |     0, | 
 |     0, | 
 |     1, | 
 |     1 | 
 |   }; | 
 |  | 
 |   // Same table as UnconstrainedSumFilterConfig_, with one entry fewer. | 
 |   const NonsepFilterConfig UnitSumFilterConfig_ = { | 
 |     kMaxPrecisionBeforeOverflow, | 
 |     sizeof(wienerns_simd_config_y) / sizeof(wienerns_simd_config_y[0]) - 1, | 
 |     0, | 
 |     wienerns_simd_config_y, | 
 |     NULL, | 
 |     0, | 
 |     1, | 
 |     1, | 
 |     1 | 
 |   }; | 
 |  | 
 |   // Config used for filtering of chroma when CONFIG_WIENER_NONSEP=1. | 
 |   const NonsepFilterConfig UnitSumFilterConfigChroma_ = { | 
 |     kMaxPrecisionBeforeOverflow, | 
 |     sizeof(wienerns_simd_config_uv_from_uv) / | 
 |             sizeof(wienerns_simd_config_uv_from_uv[0]) - | 
 |         1, | 
 |     0, | 
 |     wienerns_simd_config_uv_from_uv, | 
 |     NULL, | 
 |     0, | 
 |     1, | 
 |     1, | 
 |     1 | 
 |   }; | 
 |  | 
 |   // Random taps shared by the test and speed-test paths; written only by | 
 |   // SetFilterTaps(). | 
 |   int16_t FilterTaps_[kMaxNumSymmetricTaps + 1]; | 
 | }; | 
 |  | 
 | // Both PC_WIENER tests are registered with the DISABLED_ prefix. | 
 | TEST_P(AV1ConvolveNonSep2DHighbdTest, DISABLED_RunTest) { | 
 |   RunTest(RESTORE_PC_WIENER); | 
 | } | 
 |  | 
 | TEST_P(AV1ConvolveNonSep2DHighbdTest, DISABLED_Speed) { | 
 |   RunSpeedTest(RESTORE_PC_WIENER); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveNonSep2DHighbdTest, | 
 |                          BuildHighbdParams(av1_convolve_symmetric_highbd_avx2)); | 
 | #endif | 
 |  | 
 | // Separate fixture name so the RESTORE_WIENER_NONSEP tests can be | 
 | // instantiated with their own function under test; it adds no members. | 
 | class AV1ConvolveWienerNonSep2DHighbdTest | 
 |     : public AV1ConvolveNonSep2DHighbdTest { | 
 | }; | 
 |  | 
 | TEST_P(AV1ConvolveWienerNonSep2DHighbdTest, RunTest) { | 
 |   const RestorationType rtype = RESTORE_WIENER_NONSEP; | 
 |   RunTest(rtype); | 
 | } | 
 |  | 
 | // Registered with the DISABLED_ prefix, so it only runs on explicit request. | 
 | TEST_P(AV1ConvolveWienerNonSep2DHighbdTest, DISABLED_Speed) { | 
 |   const RestorationType rtype = RESTORE_WIENER_NONSEP; | 
 |   RunSpeedTest(rtype); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1ConvolveWienerNonSep2DHighbdTest, | 
 |     BuildHighbdParams(av1_convolve_symmetric_subtract_center_highbd_avx2)); | 
 | #endif | 
 |  | 
 | ////////////////////////////////////////////////////////// | 
 | // Nonseparable convolve-2d Dual functions (high bit-depth) | 
 | ////////////////////////////////////////////////////////// | 
 |  | 
 | // Pointer type for the high bit-depth non-separable dual-input 2D convolve | 
 | // functions: like the single-input variant, plus a second source buffer with | 
 | // its own stride. | 
 | using highbd_convolve_nonsep_dual_2d_func = void (*)( | 
 |     const uint16_t *dgd, int dgd_stride, const uint16_t *dgd_dual, | 
 |     int dgd_dual_stride, const NonsepFilterConfig *filter_config, | 
 |     const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth, | 
 |     int block_row_begin, int block_row_end, int block_col_begin, | 
 |     int block_col_end); | 
 |  | 
 | class AV1ConvolveNonSep_dual2DHighbdTest | 
 |     : public AV1ConvolveTest<highbd_convolve_nonsep_dual_2d_func> { | 
 |  public: | 
 |   void RunTest(int is_subtract_center) { | 
 |     // Each of the kTestIterations rounds draws new filter taps and then | 
 |     // compares the function under test with the reference. | 
 |     int rounds_left = kTestIterations; | 
 |     while (rounds_left > 0) { | 
 |       SetFilterTaps(); | 
 |       TestConvolve(FilterTaps_, is_subtract_center); | 
 |       --rounds_left; | 
 |     } | 
 |   } | 
 |   void RunSpeedTest(int is_subtract_center) { | 
 |     // FilterTaps_ is declared without an initializer, so fill it before the | 
 |     // timed functions read it; previously they saw indeterminate values. | 
 |     SetFilterTaps(); | 
 |     SpeedTestConvolve(FilterTaps_, is_subtract_center); | 
 |   } | 
 |  | 
 |  private: | 
 |   libaom_test::ACMRandom rnd_; | 
 |   static constexpr int kMaxPrecisionBeforeOverflow = 12; | 
 |   static constexpr int kNumSubtractCenterOffTaps = 20; | 
 |   static constexpr int kMaxTapOffset = 2;  // Filters are 5x5. | 
 |   static constexpr int kSpeedIterations = 10000; | 
 |   static constexpr int kTestIterations = 100; | 
 |  | 
 |   // Declare the filter taps for worst case (i.e., for subtract center off | 
 |   // case). | 
 |   int16_t FilterTaps_[kNumSubtractCenterOffTaps]; | 
 |  | 
 |   // Fills the array p with random values masked to the low max_bit_range | 
 |   // bits, so every entry is non-negative. | 
 |   void Randomize(int16_t *p, int size, int max_bit_range) { | 
 |     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16"; | 
 |     // Only the low max_bit_range bits of each random draw are kept. | 
 |     const int low_bits_mask = (1 << max_bit_range) - 1; | 
 |     for (int idx = 0; idx < size; ++idx) { | 
 |       const auto draw = rnd_.Rand15Signed(); | 
 |       p[idx] = draw & low_bits_mask; | 
 |     } | 
 |   } | 
 |  | 
 |   void SetFilterTaps() { | 
 |     // Refill every entry of FilterTaps_ with a random value limited to | 
 |     // kMaxPrecisionBeforeOverflow bits. | 
 |     const int num_taps = kNumSubtractCenterOffTaps; | 
 |     const int tap_bits = kMaxPrecisionBeforeOverflow; | 
 |     Randomize(FilterTaps_, num_taps, tap_bits); | 
 |   } | 
 |  | 
 |   int RandBool() { | 
 |     // Returns 1 when bit 7 of an 8-bit random draw is set, else 0. The upper | 
 |     // bits are used because this generator has a bit more entropy there. | 
 |     const uint32_t draw = rnd_.Rand8(); | 
 |     return (draw & 0x80u) != 0 ? 1 : 0; | 
 |   } | 
 |  | 
 |   // Fills the array p with maximum and minimum possible integers. | 
 |   void RandomizeExtreamFilterTap(int16_t *p, int size, int max_bit_range) { | 
 |     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16"; | 
 |     // Largest and smallest values of a signed max_bit_range-bit integer. | 
 |     const int half_range = 1 << (max_bit_range - 1); | 
 |     const int16_t largest = static_cast<int16_t>(half_range - 1); | 
 |     const int16_t smallest = static_cast<int16_t>(-half_range); | 
 |     for (int idx = 0; idx < size; ++idx) { | 
 |       if (RandBool()) { | 
 |         p[idx] = largest; | 
 |       } else { | 
 |         p[idx] = smallest; | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   void BitMatchTest(const uint16_t *dgd, const uint16_t *dgd_dual, | 
 |                     int dgd_stride, int width, int height, | 
 |                     const int16_t *filter, uint16_t *reference, uint16_t *test, | 
 |                     int dst_stride, int bit_depth, int block_row_begin, | 
 |                     int block_row_end, int block_col_begin, int block_col_end, | 
 |                     int is_subtract_center) { | 
 |     // Runs the C reference into `reference` and the function under test into | 
 |     // `test` with identical arguments, then compares the two buffers. | 
 |     // is_subtract_center picks the reference function / filter configuration | 
 |     // pair. dgd_stride is passed as the stride of both dgd and dgd_dual. | 
 |     highbd_convolve_nonsep_dual_2d_func expected_func; | 
 |     const NonsepFilterConfig *config; | 
 |     if (is_subtract_center) { | 
 |       expected_func = av1_convolve_symmetric_dual_subtract_center_highbd_c; | 
 |       config = &DualFilterWithCenterConfig_; | 
 |     } else { | 
 |       expected_func = av1_convolve_symmetric_dual_highbd_c; | 
 |       config = &DualFilterWithoutCenterConfig_; | 
 |     } | 
 |     const highbd_convolve_nonsep_dual_2d_func actual_func = | 
 |         GetParam().TestFunction(); | 
 |  | 
 |     expected_func(dgd, dgd_stride, dgd_dual, dgd_stride, config, filter, | 
 |                   reference, dst_stride, bit_depth, block_row_begin, | 
 |                   block_row_end, block_col_begin, block_col_end); | 
 |     actual_func(dgd, dgd_stride, dgd_dual, dgd_stride, config, filter, test, | 
 |                 dst_stride, bit_depth, block_row_begin, block_row_end, | 
 |                 block_col_begin, block_col_end); | 
 |  | 
 |     AssertOutputBufferEq(reference, test, width, height); | 
 |   } | 
 |  | 
 |   void TestConvolve(const int16_t *filter, int is_subtract_center) { | 
 |     // Runs BitMatchTest() twice over the whole block: first with the 12-bit | 
 |     // random inputs and the caller's taps, then with the "extreme" inputs and | 
 |     // taps set to the extreme values of the tap range. | 
 |     const int block_w = GetParam().Block().Width(); | 
 |     const int block_h = GetParam().Block().Height(); | 
 |     const int bd = GetParam().BitDepth(); | 
 |  | 
 |     // NOTE(review): both inputs come from FirstRandomInput12(); whether the | 
 |     // two calls yield distinct data is not visible here - confirm that | 
 |     // SecondRandomInput12() was not intended for the dual input. | 
 |     const uint16_t *primary = FirstRandomInput12(GetParam()); | 
 |     const uint16_t *secondary = FirstRandomInput12(GetParam()); | 
 |     DECLARE_ALIGNED(32, uint16_t, expected[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, uint16_t, actual[MAX_SB_SQUARE]); | 
 |  | 
 |     ASSERT_TRUE(kInputPadding >= kMaxTapOffset) | 
 |         << "Not enough padding for 5x5 filters"; | 
 |     // Input rows are block_w samples apart. | 
 |     const int src_stride = block_w; | 
 |     BitMatchTest(primary, secondary, src_stride, block_w, block_h, filter, | 
 |                  expected, actual, kOutputStride, bd, 0, block_h, 0, block_w, | 
 |                  is_subtract_center); | 
 |  | 
 |     // Extreme value test. | 
 |     // NOTE(review): same question as above for the two extreme inputs. | 
 |     const uint16_t *extreme_primary = FirstRandomInput16Extreme(GetParam()); | 
 |     const uint16_t *extreme_secondary = FirstRandomInput16Extreme(GetParam()); | 
 |     int16_t extreme_taps[kNumSubtractCenterOffTaps]; | 
 |     RandomizeExtreamFilterTap(extreme_taps, kNumSubtractCenterOffTaps, | 
 |                               kMaxPrecisionBeforeOverflow); | 
 |     BitMatchTest(extreme_primary, extreme_secondary, src_stride, block_w, | 
 |                  block_h, extreme_taps, expected, actual, kOutputStride, bd, 0, | 
 |                  block_h, 0, block_w, is_subtract_center); | 
 |   } | 
 |  | 
 |   // Times kSpeedIterations calls of the C reference and of the function under | 
 |   // test on the same inputs, then prints both per-pixel figures and their | 
 |   // ratio. | 
 |   void SpeedTestConvolve(const int16_t *filter, int is_subtract_center) { | 
 |     const auto &param = GetParam(); | 
 |     const int block_w = param.Block().Width(); | 
 |     const int block_h = param.Block().Height(); | 
 |     const int bd = param.BitDepth(); | 
 |  | 
 |     const uint16_t *src = FirstRandomInput12(param); | 
 |     const uint16_t *src_dual = FirstRandomInput12(param); | 
 |     DECLARE_ALIGNED(32, uint16_t, opt_out[MAX_SB_SQUARE]); | 
 |     DECLARE_ALIGNED(32, uint16_t, c_out[MAX_SB_SQUARE]); | 
 |  | 
 |     ASSERT_TRUE(kInputPadding >= kMaxTapOffset) | 
 |         << "Not enough padding for 5x5 filters"; | 
 |  | 
 |     // Select the C reference and the filter configuration for the requested | 
 |     // mode. | 
 |     highbd_convolve_nonsep_dual_2d_func c_func; | 
 |     const NonsepFilterConfig *cfg; | 
 |     if (is_subtract_center) { | 
 |       c_func = av1_convolve_symmetric_dual_subtract_center_highbd_c; | 
 |       cfg = &DualFilterWithCenterConfig_; | 
 |     } else { | 
 |       c_func = av1_convolve_symmetric_dual_highbd_c; | 
 |       cfg = &DualFilterWithoutCenterConfig_; | 
 |     } | 
 |  | 
 |     aom_usec_timer timer; | 
 |  | 
 |     // Time the reference/C function. | 
 |     aom_usec_timer_start(&timer); | 
 |     for (int iter = 0; iter < kSpeedIterations; ++iter) { | 
 |       c_func(src, block_w, src_dual, block_w, cfg, filter, c_out, | 
 |              kOutputStride, bd, 0, block_h, 0, block_w); | 
 |     } | 
 |     aom_usec_timer_mark(&timer); | 
 |     const auto usec_c = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |     // Time the optimized/intrinsic function. | 
 |     aom_usec_timer_start(&timer); | 
 |     for (int iter = 0; iter < kSpeedIterations; ++iter) { | 
 |       GetParam().TestFunction()(src, block_w, src_dual, block_w, cfg, filter, | 
 |                                 opt_out, kOutputStride, bd, 0, block_h, 0, | 
 |                                 block_w); | 
 |     } | 
 |     aom_usec_timer_mark(&timer); | 
 |     const auto usec_opt = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |     // Scale the elapsed microseconds by 1000 and divide by the number of | 
 |     // processed pixels. | 
 |     const int pixel_count = kSpeedIterations * block_w * block_h; | 
 |     const float c_time_per_pixel = (float)1000.0 * usec_c / pixel_count; | 
 |     const float opt_time_per_pixel = (float)1000.0 * usec_opt / pixel_count; | 
 |     const float scaling = c_time_per_pixel / opt_time_per_pixel; | 
 |     printf( | 
 |         " %3dx%-3d: c_time_per_pixel=%10.5f, " | 
 |         "opt_time_per_pixel=%10.5f,  scaling=%f \n", | 
 |         block_w, block_h, c_time_per_pixel, opt_time_per_pixel, scaling); | 
 |   } | 
 |  | 
 |   // Filter configuration paired with | 
 |   // av1_convolve_symmetric_dual_subtract_center_highbd_c (subtract_center is | 
 |   // set). Each pixel count is one less than the length of its config table. | 
 |   const NonsepFilterConfig DualFilterWithCenterConfig_ = { | 
 |     kMaxPrecisionBeforeOverflow,  // prec_bits | 
 |     // num_pixels | 
 |     sizeof(wienerns_simd_config_uv_from_uv) / | 
 |             sizeof(*wienerns_simd_config_uv_from_uv) - | 
 |         1, | 
 |     // num_pixels2 | 
 |     sizeof(wienerns_simd_config_uv_from_y) / | 
 |             sizeof(*wienerns_simd_config_uv_from_y) - | 
 |         1, | 
 |     wienerns_simd_config_uv_from_uv,  // config | 
 |     wienerns_simd_config_uv_from_y,   // config2 | 
 |     0,                                // strict_bounds | 
 |     1,                                // subtract_center | 
 |     1,                                // symmetry config | 
 |     0,                                // symmetry config2 | 
 |   }; | 
 |  | 
 |   // Filter configuration paired with av1_convolve_symmetric_dual_highbd_c | 
 |   // (subtract_center is cleared). Each pixel count equals the full length of | 
 |   // its config table. | 
 |   const NonsepFilterConfig DualFilterWithoutCenterConfig_ = { | 
 |     kMaxPrecisionBeforeOverflow,  // prec_bits | 
 |     // num_pixels | 
 |     sizeof(wienerns_simd_config_uv_from_uv) / | 
 |         sizeof(*wienerns_simd_config_uv_from_uv), | 
 |     // num_pixels2 | 
 |     sizeof(wienerns_simd_config_uv_from_y) / | 
 |         sizeof(*wienerns_simd_config_uv_from_y), | 
 |     wienerns_simd_config_uv_from_uv,  // config | 
 |     wienerns_simd_config_uv_from_y,   // config2 | 
 |     0,                                // strict_bounds | 
 |     0,                                // subtract_center | 
 |     1,                                // symmetry config | 
 |     0,                                // symmetry config2 | 
 |   }; | 
 | }; | 
 |  | 
 | // Bit-exactness test, plus a speed test that is disabled by default, both | 
 | // run with argument 1. | 
 | TEST_P(AV1ConvolveNonSep_dual2DHighbdTest, RunTest) { | 
 |   RunTest(1); | 
 | } | 
 | TEST_P(AV1ConvolveNonSep_dual2DHighbdTest, DISABLED_Speed) { | 
 |   RunSpeedTest(1); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | // Function under test: the AVX2 dual subtract-center kernel. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1ConvolveNonSep_dual2DHighbdTest, | 
 |     BuildHighbdParams(av1_convolve_symmetric_dual_subtract_center_highbd_avx2)); | 
 | #endif  // HAVE_AVX2 | 
 |  | 
 | // Dual-filter unit test with subtract center off: reuses the fixture above | 
 | // unchanged and passes 0 to its entry points. | 
 | class AV1ConvolveDualWithoutsubtract2DHighbdTest | 
 |     : public AV1ConvolveNonSep_dual2DHighbdTest {}; | 
 |  | 
 | TEST_P(AV1ConvolveDualWithoutsubtract2DHighbdTest, RunTest) { | 
 |   RunTest(0); | 
 | } | 
 | TEST_P(AV1ConvolveDualWithoutsubtract2DHighbdTest, DISABLED_Speed) { | 
 |   RunSpeedTest(0); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | // Function under test: the AVX2 dual kernel without center subtraction. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1ConvolveDualWithoutsubtract2DHighbdTest, | 
 |     BuildHighbdParams(av1_convolve_symmetric_dual_highbd_avx2)); | 
 | #endif  // HAVE_AVX2 | 
 |  | 
 | ////////////////////////////////////////////////////////// | 
 | // Unit tests for the buffer accumulations used to derive the filter | 
 | // index for each block size (pc_wiener_block_size: 4x4) | 
 | ////////////////////////////////////////////////////////// | 
 |  | 
 | // Builds the pc_wiener parameter list. Every distinct block dimension from | 
 | // BLOCK_4X4 upward that does not exceed RESTORATION_PROC_UNIT_SIZE in either | 
 | // direction is collected (plus a half-size entry for blocks with a 4-pixel | 
 | // side) and paired with each requested bit depth and test_func. | 
 | template <typename T> | 
 | std::vector<TestParam<T>> GetPCWienerTestParams( | 
 |     std::initializer_list<int> bit_depths, T test_func) { | 
 |   // The set removes duplicate dimensions and fixes the iteration order. | 
 |   std::set<BlockSize> unique_sizes; | 
 |   for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { | 
 |     const int wide = block_size_wide[bsize]; | 
 |     const int high = block_size_high[bsize]; | 
 |     const bool fits_proc_unit = wide <= RESTORATION_PROC_UNIT_SIZE && | 
 |                                 high <= RESTORATION_PROC_UNIT_SIZE; | 
 |     if (fits_proc_unit) { | 
 |       unique_sizes.insert(BlockSize(wide, high)); | 
 |       // Smaller chroma sizes are covered as well. | 
 |       const bool has_4px_side = (wide == 4) || (high == 4); | 
 |       if (has_4px_side) { | 
 |         unique_sizes.insert(BlockSize(wide / 2, high / 2)); | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   std::vector<TestParam<T>> params; | 
 |   for (const BlockSize &size : unique_sizes) { | 
 |     for (const int depth : bit_depths) { | 
 |       params.emplace_back(size, depth, test_func); | 
 |     } | 
 |   } | 
 |   return params; | 
 | } | 
 | // Wraps the pc_wiener parameter list for bit depths 10 and 12 in a gtest | 
 | // parameter generator. | 
 | template <typename T> | 
 | ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdPCWienerParams( | 
 |     T test_func) { | 
 |   const std::vector<TestParam<T>> params = | 
 |       GetPCWienerTestParams({ 10, 12 }, test_func); | 
 |   return ::testing::ValuesIn(params); | 
 | } | 
 |  | 
 | // Pointer type for fill_directional_feature_buffers_highbd_c and the | 
 | // optimized variants tested against it. | 
 | using fill_directional_feature_buffers_highbd_func = void (*)( | 
 |     int *feature_sum_buffers[], int16_t *feature_line_buffers[], int row, | 
 |     int buffer_row, const uint16_t *dgd, int dgd_stride, int width, | 
 |     int feature_lead, int feature_lag); | 
 |  | 
 | // Bit-exactness and speed test for implementations of | 
 | // fill_directional_feature_buffers_highbd. The C reference and the function | 
 | // under test start from identical randomized working buffers, process the | 
 | // same rows of the same input, and must leave identical feature sum buffers. | 
 | class AV1FillDirFeatureBufHighbdTest | 
 |     : public AV1ConvolveTest<fill_directional_feature_buffers_highbd_func> { | 
 |  public: | 
 |   // Repeats the bit-exactness check kTestIterations times, re-randomizing the | 
 |   // working buffers before every iteration. | 
 |   void RunTest() { | 
 |     for (int i = 0; i < kTestIterations; i++) { | 
 |       // Set buffer values here. | 
 |       SetBufferValues(); | 
 |       TestConvolve(); | 
 |     } | 
 |   } | 
 |  | 
 |   // Prints timings of the C reference and of the function under test. | 
 |   void RunSpeedTest() { SpeedTestConvolve(); } | 
 |  | 
 |  protected: | 
 |   // Allocates buffer_width_ entries for every line buffer and every sum | 
 |   // buffer, once for the C reference and once for the function under test. | 
 |   void SetUp() override { | 
 |     for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) { | 
 |       feature_line_buffers_c_[j] = static_cast<int16_t *>( | 
 |           (aom_malloc(buffer_width_ * sizeof(*feature_line_buffers_c_[j])))); | 
 |       ASSERT_NE(feature_line_buffers_c_[j], nullptr); | 
 |  | 
 |       feature_line_buffers_simd_[j] = static_cast<int16_t *>( | 
 |           (aom_malloc(buffer_width_ * sizeof(*feature_line_buffers_simd_[j])))); | 
 |       ASSERT_NE(feature_line_buffers_simd_[j], nullptr); | 
 |     } | 
 |  | 
 |     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) { | 
 |       feature_sum_buffers_c_[j] = static_cast<int *>( | 
 |           (aom_malloc(buffer_width_ * sizeof(*feature_sum_buffers_c_[j])))); | 
 |       ASSERT_NE(feature_sum_buffers_c_[j], nullptr); | 
 |  | 
 |       feature_sum_buffers_simd_[j] = static_cast<int *>( | 
 |           (aom_malloc(buffer_width_ * sizeof(*feature_sum_buffers_simd_[j])))); | 
 |       ASSERT_NE(feature_sum_buffers_simd_[j], nullptr); | 
 |     } | 
 |   } | 
 |  | 
 |   // Releases every buffer and resets the pointers. The pointer arrays start | 
 |   // out as nullptr, so this stays well-defined when SetUp() stopped early on | 
 |   // a failed assertion. | 
 |   void TearDown() override { | 
 |     for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) { | 
 |       aom_free(feature_line_buffers_c_[j]); | 
 |       feature_line_buffers_c_[j] = nullptr; | 
 |       aom_free(feature_line_buffers_simd_[j]); | 
 |       feature_line_buffers_simd_[j] = nullptr; | 
 |     } | 
 |  | 
 |     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) { | 
 |       aom_free(feature_sum_buffers_c_[j]); | 
 |       feature_sum_buffers_c_[j] = nullptr; | 
 |       aom_free(feature_sum_buffers_simd_[j]); | 
 |       feature_sum_buffers_simd_[j] = nullptr; | 
 |     } | 
 |   } | 
 |  | 
 |   // Randomizes the reference buffers and copies them into the buffers of the | 
 |   // function under test, so both implementations start from the same state. | 
 |   void SetBufferValues() { | 
 |     const int bitdepth = GetParam().BitDepth(); | 
 |     for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) { | 
 |       Randomize(feature_line_buffers_c_[j], buffer_width_, bitdepth); | 
 |       memcpy(feature_line_buffers_simd_[j], feature_line_buffers_c_[j], | 
 |              buffer_width_ * sizeof(*feature_line_buffers_simd_[j])); | 
 |     } | 
 |  | 
 |     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) { | 
 |       RandomizeSigned31(feature_sum_buffers_c_[j], buffer_width_, 31); | 
 |       memcpy(feature_sum_buffers_simd_[j], feature_sum_buffers_c_[j], | 
 |              buffer_width_ * sizeof(*feature_sum_buffers_simd_[j])); | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   libaom_test::ACMRandom rnd_; | 
 |   static constexpr int kSpeedIterations = 10000; | 
 |   static constexpr int kTestIterations = 100; | 
 |  | 
 |   // Feeds the same rows to the C reference and to the function under test, | 
 |   // then requires the feature sum buffers to match element by element. | 
 |   void TestConvolve() { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |     // Input buffer allocation. | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     const int input_stride = width; | 
 |  | 
 |     // C function call | 
 |     for (int i = 0; i < height; ++i) { | 
 |       const int row_to_process = AOMMIN(i + feature_lag, height + 3 - 2); | 
 |       fill_directional_feature_buffers_highbd_c( | 
 |           feature_sum_buffers_c_, feature_line_buffers_c_, row_to_process, | 
 |           feature_length - 1, input, input_stride, width, feature_lead, | 
 |           feature_lag); | 
 |     } | 
 |  | 
 |     // SIMD function call | 
 |     for (int i = 0; i < height; ++i) { | 
 |       const int row_to_process = AOMMIN(i + feature_lag, height + 3 - 2); | 
 |       GetParam().TestFunction()(feature_sum_buffers_simd_, | 
 |                                 feature_line_buffers_simd_, row_to_process, | 
 |                                 feature_length - 1, input, input_stride, width, | 
 |                                 feature_lead, feature_lag); | 
 |     } | 
 |  | 
 |     // Compare the outputs of C and SIMD | 
 |     for (int i = 0; i < NUM_PC_WIENER_FEATURES; i++) { | 
 |       const int *c_buf = feature_sum_buffers_c_[i]; | 
 |       const int *simd_buf = feature_sum_buffers_simd_[i]; | 
 |       for (int j = 0; j < buffer_width_; ++j) { | 
 |         // Report the column index j; the feature index i is already printed. | 
 |         ASSERT_EQ(c_buf[j], simd_buf[j]) | 
 |             << "feature_buf=" << i << " Pixel mismatch at width (" << j << ")"; | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   // Times kSpeedIterations full passes over the block rows for the C | 
 |   // reference and for the function under test, then prints both figures and | 
 |   // their ratio. | 
 |   void SpeedTestConvolve() { | 
 |     const int width = GetParam().Block().Width(); | 
 |     const int height = GetParam().Block().Height(); | 
 |  | 
 |     // Input buffer allocation. | 
 |     const uint16_t *input = FirstRandomInput12(GetParam()); | 
 |     const int input_stride = width; | 
 |  | 
 |     // Calculate time taken for C function | 
 |     aom_usec_timer timer; | 
 |     aom_usec_timer_start(&timer); | 
 |     for (int iter = 0; iter < kSpeedIterations; ++iter) { | 
 |       for (int row = 0; row < height; ++row) { | 
 |         const int row_to_process = AOMMIN(row + feature_lag, height + 3 - 2); | 
 |         fill_directional_feature_buffers_highbd_c( | 
 |             feature_sum_buffers_c_, feature_line_buffers_c_, row_to_process, | 
 |             feature_length - 1, input, input_stride, width, feature_lead, | 
 |             feature_lag); | 
 |       } | 
 |     } | 
 |     aom_usec_timer_mark(&timer); | 
 |     auto elapsed_time_c = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |     // Calculate time taken by optimized/intrinsic function | 
 |     aom_usec_timer_start(&timer); | 
 |     for (int iter = 0; iter < kSpeedIterations; ++iter) { | 
 |       for (int row = 0; row < height; ++row) { | 
 |         const int row_to_process = AOMMIN(row + feature_lag, height + 3 - 2); | 
 |         GetParam().TestFunction()(feature_sum_buffers_simd_, | 
 |                                   feature_line_buffers_simd_, row_to_process, | 
 |                                   feature_length - 1, input, input_stride, | 
 |                                   width, feature_lead, feature_lag); | 
 |       } | 
 |     } | 
 |     aom_usec_timer_mark(&timer); | 
 |     auto elapsed_time_opt = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |     float c_time_per_pixel = | 
 |         (float)1000.0 * elapsed_time_c / (kSpeedIterations * width * height); | 
 |     float opt_time_per_pixel = | 
 |         (float)1000.0 * elapsed_time_opt / (kSpeedIterations * width * height); | 
 |     float scaling = c_time_per_pixel / opt_time_per_pixel; | 
 |     printf( | 
 |         "%3dx%-3d: c_time_per_pixel=%10.5f, " | 
 |         "opt_time_per_pixel=%10.5f,  scaling=%f \n", | 
 |         width, height, c_time_per_pixel, opt_time_per_pixel, scaling); | 
 |   } | 
 |  | 
 |   // Fills p with random values masked to the low max_bit_range bits, so every | 
 |   // entry lies in [0, 2^max_bit_range). | 
 |   void Randomize(int16_t *p, int size, int max_bit_range) { | 
 |     ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16"; | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1); | 
 |     } | 
 |   } | 
 |  | 
 |   // Fills p with Rand31() values masked to the low max_bit_range bits | 
 |   // (max_bit_range <= 31), so every entry is non-negative. | 
 |   void RandomizeSigned31(int *p, int size, uint32_t max_bit_range) { | 
 |     assert(max_bit_range <= 31); | 
 |     // Shift an unsigned 1 so that max_bit_range == 31 never shifts a signed | 
 |     // int into its sign bit. | 
 |     const uint32_t mask = (static_cast<uint32_t>(1) << max_bit_range) - 1; | 
 |     for (int i = 0; i < size; ++i) { | 
 |       p[i] = (int)(rnd_.Rand31() & mask); | 
 |     } | 
 |   } | 
 |  | 
 |   // Working buffers, value-initialized to nullptr so that TearDown() never | 
 |   // frees an indeterminate pointer. | 
 |   int *feature_sum_buffers_c_[NUM_PC_WIENER_FEATURES] = {}; | 
 |   int *feature_sum_buffers_simd_[NUM_PC_WIENER_FEATURES] = {}; | 
 |   int16_t *feature_line_buffers_c_[NUM_FEATURE_LINE_BUFFERS] = {}; | 
 |   int16_t *feature_line_buffers_simd_[NUM_FEATURE_LINE_BUFFERS] = {}; | 
 |   const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA; | 
 |   const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA; | 
 |   const int feature_length = PC_WIENER_FEATURE_LENGTH_LUMA; | 
 |   const int buffer_width_ = MAX_SB_SIZE + kInputPadding; | 
 | }; | 
 |  | 
 | // Bit-exactness test, plus a speed test that is disabled by default. | 
 | TEST_P(AV1FillDirFeatureBufHighbdTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 | TEST_P(AV1FillDirFeatureBufHighbdTest, DISABLED_Speed) { | 
 |   RunSpeedTest(); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | // Function under test: the AVX2 version of the feature buffer fill. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1FillDirFeatureBufHighbdTest, | 
 |     BuildHighbdPCWienerParams(fill_directional_feature_buffers_highbd_avx2)); | 
 | #endif  // HAVE_AVX2 | 
 |  | 
 | // Pointer type for av1_fill_tskip_sum_buffer_c and the optimized variants | 
 | // tested against it. | 
 | using FillTSkipSumBufferFunc = void (*)(int row, const uint8_t *tskip, | 
 |                                         int tskip_stride, | 
 |                                         int8_t *tskip_sum_buffer, int width, | 
 |                                         int height, int tskip_lead, | 
 |                                         int tskip_lag, bool use_strict_bounds); | 
 |  | 
 | // Test parameter: the implementation under test. | 
 | using AV1FillTSkipSumBufferFuncParam = | 
 |     std::tuple<const FillTSkipSumBufferFunc>; | 
 |  | 
 | // Checks implementations of av1_fill_tskip_sum_buffer against the C | 
 | // reference. A random 0/1 input map is fed row by row to both versions and | 
 | // the resulting sum buffers must be identical. | 
 | class AV1Fill_TSkip_Sum_BufferTest | 
 |     : public ::testing::TestWithParam<AV1FillTSkipSumBufferFuncParam> { | 
 |  public: | 
 |   // Fetches the implementation under test from the test parameter. | 
 |   void SetUp() override { target_func_ = GET_PARAM(0); } | 
 |  | 
 |   // Runs kTestIterations bit-exactness checks, each on fresh random input. | 
 |   void RunTest() { | 
 |     for (int iter = 0; iter < kTestIterations; ++iter) { | 
 |       TestTSkipSum(); | 
 |     } | 
 |   } | 
 |  | 
 |   // Prints timings of the C reference and of the function under test. | 
 |   void RunSpeedTest() { SpeedTestTSkipSum(); } | 
 |  | 
 |  private: | 
 |   libaom_test::ACMRandom rnd_; | 
 |   FillTSkipSumBufferFunc target_func_; | 
 |  | 
 |   static constexpr int kSpeedIterations = 10000; | 
 |   static constexpr int kTestIterations = 100; | 
 |   static constexpr int kNumPlanes = 1; | 
 |   static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE; | 
 |   static constexpr int kHeight = RESTORATION_PROC_UNIT_SIZE; | 
 |   static constexpr int kInputWidth = MI_SIZE_64X64; | 
 |   static constexpr int kInputStride = MI_SIZE_64X64; | 
 |   static constexpr int kOutputWidth = | 
 |       (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1); | 
 |  | 
 |   uint8_t input_buffer_[MI_SIZE_64X64 * MI_SIZE_64X64]; | 
 |   int8_t ref_buffer_[kOutputWidth]; | 
 |   int8_t test_buffer_[kOutputWidth]; | 
 |   const bool tskip_strict_ = true; | 
 |  | 
 |   // Returns bit 7 of a fresh Rand8() draw, i.e. 0 or 1. | 
 |   int RandBool() { | 
 |     // There's a bit more entropy in the upper bits of this implementation. | 
 |     const uint32_t draw = rnd_.Rand8(); | 
 |     return (draw >> 7) & 0x1; | 
 |   } | 
 |  | 
 |   // Fills the whole input map with random 0/1 flags. | 
 |   void RandomizeInput() { | 
 |     const int num_entries = kInputWidth * kInputStride; | 
 |     for (int idx = 0; idx < num_entries; ++idx) { | 
 |       input_buffer_[idx] = static_cast<uint8_t>(RandBool() ? 1 : 0); | 
 |     } | 
 |   } | 
 |  | 
 |   // Zeroes the reference and test output buffers. | 
 |   void ClearOutputs() { | 
 |     memset(ref_buffer_, 0, sizeof(ref_buffer_)); | 
 |     memset(test_buffer_, 0, sizeof(test_buffer_)); | 
 |   } | 
 |  | 
 |   // Runs both implementations over rows [-tskip_lead, tskip_lag + height) | 
 |   // and requires the two sum buffers to match element by element. | 
 |   void TestTSkipSum() { | 
 |     RandomizeInput(); | 
 |  | 
 |     for (int plane = 0; plane < kNumPlanes; ++plane) { | 
 |       // Planes after the first are processed at half width and height. | 
 |       const int subsampling = (plane > 0) ? 1 : 0; | 
 |       const int plane_width = kWidth >> subsampling; | 
 |       const int plane_height = kHeight >> subsampling; | 
 |       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA; | 
 |       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA; | 
 |       const int first_row = -tskip_lead; | 
 |       const int end_row = tskip_lag + plane_height; | 
 |  | 
 |       ClearOutputs(); | 
 |  | 
 |       // Reference function | 
 |       for (int row = first_row; row < end_row; ++row) { | 
 |         av1_fill_tskip_sum_buffer_c(row, input_buffer_, kInputStride, | 
 |                                     ref_buffer_, plane_width, plane_height, | 
 |                                     tskip_lead, tskip_lag, tskip_strict_); | 
 |       } | 
 |  | 
 |       // Test function | 
 |       for (int row = first_row; row < end_row; ++row) { | 
 |         target_func_(row, input_buffer_, kInputStride, test_buffer_, | 
 |                      plane_width, plane_height, tskip_lead, tskip_lag, | 
 |                      tskip_strict_); | 
 |       } | 
 |  | 
 |       // Compare the output of reference and test for bit match | 
 |       for (int col = 0; col < kOutputWidth; ++col) { | 
 |         ASSERT_EQ(ref_buffer_[col], test_buffer_[col]) | 
 |             << " Mismatch at (" << col << ")"; | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   // Times kSpeedIterations passes over rows | 
 |   // [-tskip_lead, tskip_lag + height - 1) for both implementations and | 
 |   // prints the figures and their ratio. | 
 |   void SpeedTestTSkipSum() { | 
 |     RandomizeInput(); | 
 |  | 
 |     for (int plane = 0; plane < kNumPlanes; ++plane) { | 
 |       const int subsampling = (plane > 0) ? 1 : 0; | 
 |       const int plane_width = kWidth >> subsampling; | 
 |       const int plane_height = kHeight >> subsampling; | 
 |       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA; | 
 |       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA; | 
 |       const int first_row = -tskip_lead; | 
 |       const int end_row = tskip_lag + plane_height - 1; | 
 |  | 
 |       ClearOutputs(); | 
 |  | 
 |       aom_usec_timer timer; | 
 |  | 
 |       // Time the reference/C function. | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int iter = 0; iter < kSpeedIterations; ++iter) { | 
 |         for (int row = first_row; row < end_row; ++row) { | 
 |           av1_fill_tskip_sum_buffer_c(row, input_buffer_, kInputStride, | 
 |                                       ref_buffer_, plane_width, plane_height, | 
 |                                       tskip_lead, tskip_lag, tskip_strict_); | 
 |         } | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       const auto usec_c = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Time the optimized/intrinsic function. | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int iter = 0; iter < kSpeedIterations; ++iter) { | 
 |         for (int row = first_row; row < end_row; ++row) { | 
 |           target_func_(row, input_buffer_, kInputStride, test_buffer_, | 
 |                        plane_width, plane_height, tskip_lead, tskip_lag, | 
 |                        tskip_strict_); | 
 |         } | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       const auto usec_opt = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Elapsed microseconds scaled by 1000, averaged over the iterations. | 
 |       const float c_time_per_pixel = (float)1000.0 * usec_c / kSpeedIterations; | 
 |       const float opt_time_per_pixel = | 
 |           (float)1000.0 * usec_opt / kSpeedIterations; | 
 |       const float scaling = c_time_per_pixel / opt_time_per_pixel; | 
 |       printf( | 
 |           " %3dx%-3d: c_time_per_pixel=%10.5f, " | 
 |           "opt_time_per_pixel=%10.5f,  scaling=%f \n", | 
 |           plane_width, plane_height, c_time_per_pixel, opt_time_per_pixel, | 
 |           scaling); | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | // Bit-exactness test, plus a speed test that is disabled by default. | 
 | TEST_P(AV1Fill_TSkip_Sum_BufferTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 | TEST_P(AV1Fill_TSkip_Sum_BufferTest, DISABLED_Speed) { | 
 |   RunSpeedTest(); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | // Function under test: the AVX2 version of the tskip sum buffer fill. | 
 | INSTANTIATE_TEST_SUITE_P(AVX2, AV1Fill_TSkip_Sum_BufferTest, | 
 |                          ::testing::Values(av1_fill_tskip_sum_buffer_avx2)); | 
 | #endif  // HAVE_AVX2 | 
 |  | 
 | ////////////////////////////////////////////////////////// | 
 | //       unit-test for 'directional_feature_accum'      // | 
 | ////////////////////////////////////////////////////////// | 
 | // Pointer type for av1_fill_directional_feature_accumulators_c and the | 
 | // optimized variants tested against it. | 
 | using FillDirFeatureAccumFunc = void (*)( | 
 |     int dir_feature_accum[NUM_PC_WIENER_FEATURES][PC_WIENER_FEATURE_ACC_SIZE], | 
 |     int *feature_sum_buf[NUM_PC_WIENER_FEATURES], int width, int col_offset, | 
 |     int feature_lead, int feature_lag); | 
 |  | 
 | // Test parameter: the implementation under test. | 
 | using AV1FillDirFeatureAccumFuncParam = | 
 |     std::tuple<const FillDirFeatureAccumFunc>; | 
 |  | 
 | // Checks implementations of av1_fill_directional_feature_accumulators | 
 | // against the C reference: both read the same random feature sum buffers | 
 | // and must produce identical accumulator arrays. | 
 | class AV1FeatureDirAccumHighbdTest | 
 |     : public ::testing::TestWithParam<AV1FillDirFeatureAccumFuncParam> { | 
 |  public: | 
 |   // Runs kTestIterations bit-exactness checks, each on fresh random input. | 
 |   void RunTest() { | 
 |     for (int i = 0; i < kTestIterations; i++) { | 
 |       FillInputBufs(); | 
 |       TestFillDirFeatureAccum(); | 
 |     } | 
 |   } | 
 |  | 
 |   // Prints timings of the C reference and of the function under test. | 
 |   void RunSpeedTest() { SpeedTestConvolve(); } | 
 |  | 
 |   // Fetches the implementation under test and allocates kInputWidth entries | 
 |   // per feature sum buffer. | 
 |   void SetUp() override { | 
 |     target_func_ = GET_PARAM(0); | 
 |  | 
 |     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) { | 
 |       feature_sum_buf[j] = static_cast<int *>( | 
 |           aom_malloc(kInputWidth * sizeof(*feature_sum_buf[j]))); | 
 |       // Stop here on allocation failure rather than writing through a null | 
 |       // pointer later. | 
 |       ASSERT_NE(feature_sum_buf[j], nullptr); | 
 |     } | 
 |   } | 
 |  | 
 |   // Releases the input buffers. The pointers start out as nullptr, so this | 
 |   // stays well-defined when SetUp() stopped early. | 
 |   void TearDown() override { | 
 |     for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) { | 
 |       aom_free(feature_sum_buf[j]); | 
 |       feature_sum_buf[j] = nullptr; | 
 |     } | 
 |   } | 
 |  | 
 |  private: | 
 |   libaom_test::ACMRandom rnd_; | 
 |   FillDirFeatureAccumFunc target_func_; | 
 |  | 
 |   static constexpr int kSpeedIterations = 1000000; | 
 |   static constexpr int kTestIterations = 100; | 
 |   static constexpr int kNumPlanes = 2; | 
 |   static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE; | 
 |   static constexpr int kInputWidth = | 
 |       (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1); | 
 |  | 
 |   int *feature_sum_buf[NUM_PC_WIENER_FEATURES] = {}; | 
 |   int dir_feature_accum_buf_c[NUM_PC_WIENER_FEATURES] | 
 |                              [PC_WIENER_FEATURE_ACC_SIZE] = { { 0 } }; | 
 |   int dir_feature_accum_buf_simd[NUM_PC_WIENER_FEATURES] | 
 |                                 [PC_WIENER_FEATURE_ACC_SIZE] = { { 0 } }; | 
 |  | 
 |   // Returns bit 7 of a fresh Rand8() draw, i.e. 0 or 1. | 
 |   int RandBool() { | 
 |     const uint32_t value = rnd_.Rand8(); | 
 |     // There's a bit more entropy in the upper bits of this implementation. | 
 |     return (value >> 7) & 0x1; | 
 |   } | 
 |  | 
 |   // Fills every feature sum buffer with random values of random sign and | 
 |   // zeroes both accumulator arrays. | 
 |   void FillInputBufs() { | 
 |     for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i) { | 
 |       for (int j = 0; j < kInputWidth; ++j) { | 
 |         // For the extreme values case, the maximum input that feature_sum_buf | 
 |         // can take is (kInputWidth * 2 * input_max_value). Hence, clipping the | 
 |         // value generated to 23 bit. | 
 |         const int max_range = (1 << 23); | 
 |         const int value = rnd_.Rand31() % max_range; | 
 |         // Store the full signed value. Narrowing it to uint8_t would limit | 
 |         // every input to 0..255 and never reach the 23-bit range above. | 
 |         feature_sum_buf[i][j] = RandBool() ? value : -value; | 
 |       } | 
 |     } | 
 |     // Reset output buffers | 
 |     av1_zero(dir_feature_accum_buf_c); | 
 |     av1_zero(dir_feature_accum_buf_simd); | 
 |   } | 
 |  | 
 |   // Runs both implementations at full width and at half width and requires | 
 |   // the accumulator arrays to match element by element. | 
 |   void TestFillDirFeatureAccum() { | 
 |     for (int plane = 0; plane < kNumPlanes; ++plane) { | 
 |       const int is_uv = (plane > 0); | 
 |       const int ss_x = is_uv ? 1 : 0; | 
 |       const int plane_width = kWidth >> ss_x; | 
 |       const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA; | 
 |       const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA; | 
 |  | 
 |       // Reset output buffers | 
 |       av1_zero(dir_feature_accum_buf_c); | 
 |       av1_zero(dir_feature_accum_buf_simd); | 
 |  | 
 |       // C function call | 
 |       av1_fill_directional_feature_accumulators_c( | 
 |           dir_feature_accum_buf_c, feature_sum_buf, plane_width, feature_lag, | 
 |           feature_lead, feature_lag); | 
 |  | 
 |       // SIMD function call | 
 |       target_func_(dir_feature_accum_buf_simd, feature_sum_buf, plane_width, | 
 |                    feature_lag, feature_lead, feature_lag); | 
 |  | 
 |       // Compare the output of reference and test for bit match | 
 |       for (int i = 0; i < NUM_PC_WIENER_FEATURES; i++) { | 
 |         for (int j = 0; j < PC_WIENER_FEATURE_ACC_SIZE; j++) { | 
 |           ASSERT_EQ(dir_feature_accum_buf_c[i][j], | 
 |                     dir_feature_accum_buf_simd[i][j]) | 
 |               << " Feature_Buf: Pixel mismatch at (" << i << ", " << j << ", " | 
 |               << plane_width << ")"; | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   // Times kSpeedIterations calls of both implementations per plane width and | 
 |   // prints the figures and their ratio. | 
 |   void SpeedTestConvolve() { | 
 |     for (int plane = 0; plane < kNumPlanes; ++plane) { | 
 |       const int is_uv = (plane > 0); | 
 |       const int ss_x = is_uv ? 1 : 0; | 
 |       const int plane_width = kWidth >> ss_x; | 
 |       const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA; | 
 |       const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA; | 
 |       FillInputBufs(); | 
 |  | 
 |       // Calculate time taken by reference/c function | 
 |       aom_usec_timer timer; | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int i = 0; i < kSpeedIterations; ++i) { | 
 |         av1_fill_directional_feature_accumulators_c( | 
 |             dir_feature_accum_buf_c, feature_sum_buf, plane_width, feature_lag, | 
 |             feature_lead, feature_lag); | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       auto elapsed_time_c = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Calculate time taken by optimized/intrinsic function | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int i = 0; i < kSpeedIterations; ++i) { | 
 |         target_func_(dir_feature_accum_buf_simd, feature_sum_buf, plane_width, | 
 |                      feature_lag, feature_lead, feature_lag); | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       auto elapsed_time_opt = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       float c_time_per_pixel = | 
 |           (float)1000.0 * elapsed_time_c / (kSpeedIterations * plane_width); | 
 |       float opt_time_per_pixel = | 
 |           (float)1000.0 * elapsed_time_opt / (kSpeedIterations * plane_width); | 
 |       float scaling = c_time_per_pixel / opt_time_per_pixel; | 
 |       printf( | 
 |           " %3d: c_time_per_pixel=%10.5f, " | 
 |           "opt_time_per_pixel=%10.5f,  scaling=%f \n", | 
 |           plane_width, c_time_per_pixel, opt_time_per_pixel, scaling); | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | // Bit-exactness test, plus a speed test that is disabled by default. | 
 | TEST_P(AV1FeatureDirAccumHighbdTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 | TEST_P(AV1FeatureDirAccumHighbdTest, DISABLED_Speed) { | 
 |   RunSpeedTest(); | 
 | } | 
 |  | 
 | #if HAVE_AVX2 | 
 | // Function under test: the AVX2 version of the accumulator fill. | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1FeatureDirAccumHighbdTest, | 
 |     ::testing::Values(av1_fill_directional_feature_accumulators_avx2)); | 
 | #endif  // HAVE_AVX2 | 
 |  | 
 | ////////////////////////////////////////////////////////// | 
 | //     unit-test for 'fill_tskip_feature_accumulator'   // | 
 | ////////////////////////////////////////////////////////// | 
 | // Pointer type for av1_fill_tskip_feature_accumulator_c and the optimized | 
 | // variants tested against it. | 
 | using FillTskip_Accumulator_func = void (*)( | 
 |     int16_t tskip_feature_accum[PC_WIENER_FEATURE_ACC_SIZE], | 
 |     int8_t *tskip_sum_buff, int width, int col_offset, int tskip_lead, | 
 |     int tskip_lag); | 
 |  | 
 | // Test parameter: the implementation under test. | 
 | using AV1FillTSkipAccumBufferFuncParam = | 
 |     std::tuple<const FillTskip_Accumulator_func>; | 
 |  | 
 | // Bit-exactness and speed test for a tskip feature accumulator function. | 
 | // The single test parameter is the function under test; its output is compared | 
 | // against av1_fill_tskip_feature_accumulator_c on randomly generated input. | 
 | class AV1TskipAccumHighbdTest | 
 |     : public ::testing::TestWithParam<AV1FillTSkipAccumBufferFuncParam> { | 
 |  public: | 
 |   virtual void SetUp() { target_func_ = GET_PARAM(0); } | 
 |  | 
 |   // Releases the input buffer when a test body returned early (a failing | 
 |   // ASSERT_* leaves the helper before its own cleanup runs). | 
 |   virtual void TearDown() { free_buffer(); } | 
 |  | 
 |   // Runs kTestIterations comparisons, each on freshly generated input, and | 
 |   // stops at the first fatal failure instead of repeating it. | 
 |   void RunTest() { | 
 |     for (int i = 0; i < kTestIterations; i++) { | 
 |       TestTskipAccum(); | 
 |       if (HasFatalFailure()) return; | 
 |     } | 
 |   } | 
 |  | 
 |   // Times the reference and the parameterized function and prints the ratio. | 
 |   void RunSpeedTest() { SpeedTestTskipAccum(); } | 
 |  | 
 |  private: | 
 |   libaom_test::ACMRandom rnd_; | 
 |   FillTskip_Accumulator_func target_func_; | 
 |  | 
 |   static constexpr int kSpeedIterations = 1000000; | 
 |   static constexpr int kTestIterations = 100; | 
 |   static constexpr int kNumPlanes = 2; | 
 |   static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE; | 
 |   static constexpr int kInputWidth = | 
 |       (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1); | 
 |  | 
 |   // Input buffer of kInputWidth entries; owned by the fixture and null while | 
 |   // no test body is using it. | 
 |   int8_t *tskip_sum_buf = nullptr; | 
 |   int16_t tskip_feature_accum_c[PC_WIENER_FEATURE_ACC_SIZE] = { 0 }; | 
 |   int16_t tskip_feature_accum_simd[PC_WIENER_FEATURE_ACC_SIZE] = { 0 }; | 
 |  | 
 |   // Frees the input buffer, if any, and resets the pointer. | 
 |   void free_buffer() { | 
 |     if (tskip_sum_buf != nullptr) { | 
 |       aom_free(tskip_sum_buf); | 
 |       tskip_sum_buf = nullptr; | 
 |     } | 
 |   } | 
 |  | 
 |   // Allocates the input buffer and fills it with random signed values. | 
 |   // On allocation failure tskip_sum_buf is left null; callers must check it. | 
 |   void buffer_alloc_and_set_data() { | 
 |     // Drop a buffer left behind by an iteration that returned early. | 
 |     free_buffer(); | 
 |     tskip_sum_buf = static_cast<int8_t *>( | 
 |         aom_malloc(kInputWidth * sizeof(*tskip_sum_buf))); | 
 |     if (tskip_sum_buf == nullptr) return; | 
 |     // Input buffer filling. Tskip buffer max value will not cross width of | 
 |     // restoration unit size. Hence, the generated magnitudes are reduced | 
 |     // modulo RESTORATION_PROC_UNIT_SIZE and then given a random sign. | 
 |     for (int i = 0; i < kInputWidth; ++i) { | 
 |       const int8_t value = | 
 |           static_cast<int8_t>(rnd_.Rand8() % RESTORATION_PROC_UNIT_SIZE); | 
 |       // The buffer is int8_t, so cast to the signed type directly. | 
 |       tskip_sum_buf[i] = static_cast<int8_t>(RandBool() ? value : -value); | 
 |     } | 
 |   } | 
 |  | 
 |   // Returns 0 or 1, taken from the top bit of one Rand8() draw. | 
 |   int RandBool() { | 
 |     const uint32_t value = rnd_.Rand8(); | 
 |     // There's a bit more entropy in the upper bits of this implementation. | 
 |     return (value >> 7) & 0x1; | 
 |   } | 
 |  | 
 |   // Runs the reference and the function under test on the same input for each | 
 |   // plane width and asserts that every accumulator entry matches. | 
 |   void TestTskipAccum() { | 
 |     // Allocate memory and fill input buffer | 
 |     buffer_alloc_and_set_data(); | 
 |     ASSERT_TRUE(tskip_sum_buf != nullptr) << "Input buffer allocation failed"; | 
 |  | 
 |     // Loop over luma and chroma plane | 
 |     for (int plane = 0; plane < kNumPlanes; ++plane) { | 
 |       const int is_uv = (plane > 0); | 
 |       const int ss_x = is_uv ? 1 : 0; | 
 |       const int plane_width = kWidth >> ss_x; | 
 |       // Both planes use the luma lead/lag constants. | 
 |       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA; | 
 |       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA; | 
 |       av1_zero(tskip_feature_accum_c); | 
 |       av1_zero(tskip_feature_accum_simd); | 
 |  | 
 |       // C function call. Arguments are (accumulator, input, width, col_offset, | 
 |       // tskip_lead, tskip_lag); tskip_lag is also passed as col_offset. | 
 |       av1_fill_tskip_feature_accumulator_c(tskip_feature_accum_c, tskip_sum_buf, | 
 |                                            plane_width, tskip_lag, tskip_lead, | 
 |                                            tskip_lag); | 
 |  | 
 |       // SIMD function call | 
 |       target_func_(tskip_feature_accum_simd, tskip_sum_buf, plane_width, | 
 |                    tskip_lag, tskip_lead, tskip_lag); | 
 |  | 
 |       // Compare the output of reference and test for bit match. A failure | 
 |       // returns from here; TearDown() then releases the input buffer. | 
 |       for (int i = 0; i < PC_WIENER_FEATURE_ACC_SIZE; i++) { | 
 |         ASSERT_EQ(tskip_feature_accum_c[i], tskip_feature_accum_simd[i]) | 
 |             << " Feature_Buf: Pixel mismatch at (" << i << "," << plane_width | 
 |             << ")"; | 
 |       } | 
 |     } | 
 |     free_buffer(); | 
 |   } | 
 |  | 
 |   // Times kSpeedIterations calls of the reference and of the function under | 
 |   // test for each plane width and prints per-pixel times and their ratio. | 
 |   void SpeedTestTskipAccum() { | 
 |     // Allocate memory and fill input buffer | 
 |     buffer_alloc_and_set_data(); | 
 |     ASSERT_TRUE(tskip_sum_buf != nullptr) << "Input buffer allocation failed"; | 
 |  | 
 |     for (int plane = 0; plane < kNumPlanes; ++plane) { | 
 |       const int is_uv = (plane > 0); | 
 |       const int ss_x = is_uv ? 1 : 0; | 
 |       const int plane_width = kWidth >> ss_x; | 
 |       // Both planes use the luma lead/lag constants. | 
 |       const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA; | 
 |       const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA; | 
 |  | 
 |       // Calculate time taken by reference/c function | 
 |       aom_usec_timer timer; | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int i = 0; i < kSpeedIterations; ++i) { | 
 |         av1_fill_tskip_feature_accumulator_c(tskip_feature_accum_c, | 
 |                                              tskip_sum_buf, plane_width, | 
 |                                              tskip_lag, tskip_lead, tskip_lag); | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       auto elapsed_time_c = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Calculate time taken by optimized/intrinsic function | 
 |       aom_usec_timer_start(&timer); | 
 |       for (int i = 0; i < kSpeedIterations; ++i) { | 
 |         target_func_(tskip_feature_accum_simd, tskip_sum_buf, plane_width, | 
 |                      tskip_lag, tskip_lead, tskip_lag); | 
 |       } | 
 |       aom_usec_timer_mark(&timer); | 
 |       auto elapsed_time_opt = aom_usec_timer_elapsed(&timer); | 
 |  | 
 |       // Elapsed microseconds scaled by 1000 and divided by the number of | 
 |       // processed entries (iterations * width). | 
 |       float c_time_per_pixel = | 
 |           1000.0f * elapsed_time_c / (kSpeedIterations * plane_width); | 
 |       float opt_time_per_pixel = | 
 |           1000.0f * elapsed_time_opt / (kSpeedIterations * plane_width); | 
 |       float scaling = c_time_per_pixel / opt_time_per_pixel; | 
 |       printf( | 
 |           " %3d: c_time_per_pixel=%10.5f, " | 
 |           "opt_time_per_pixel=%10.5f,  scaling=%f \n", | 
 |           plane_width, c_time_per_pixel, opt_time_per_pixel, scaling); | 
 |     } | 
 |     free_buffer(); | 
 |   } | 
 | }; | 
 |  | 
 | // Bit-exactness check of the parameterized function against the C reference. | 
 | TEST_P(AV1TskipAccumHighbdTest, RunTest) { | 
 |   RunTest(); | 
 | } | 
 |  | 
 | // Timing comparison; the DISABLED_ prefix keeps it out of default runs. | 
 | TEST_P(AV1TskipAccumHighbdTest, DISABLED_Speed) { | 
 |   RunSpeedTest(); | 
 | } | 
 |  | 
 | // Instantiate the suite with the AVX2 implementation as the function under | 
 | // test; compiled only when HAVE_AVX2 is set. | 
 | #if HAVE_AVX2 | 
 | INSTANTIATE_TEST_SUITE_P( | 
 |     AVX2, AV1TskipAccumHighbdTest, | 
 |     ::testing::Values(av1_fill_tskip_feature_accumulator_avx2)); | 
 | #endif  // HAVE_AVX2 | 
 | }  // namespace |