blob: a78380da01d652ef4f43d29b89a3a7fc2bdf8183 [file] [log] [blame]
/*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*/
#include <ostream>
#include <set>
#include <vector>
#include "aom_ports/aom_timer.h"
#include "config/av1_rtcd.h"
#include "config/aom_dsp_rtcd.h"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#include "test/acm_random.h"
#pragma GCC diagnostic pop
#include "test/clear_system_state.h"
#include "test/util.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#if CONFIG_LR_IMPROVEMENTS
#include "av1/common/restoration.h"
#endif // CONFIG_LR_IMPROVEMENTS
namespace {
// TODO(any): Remove the following INTERP_FILTERS_ALL override, so that the
// 12-tap filter is tested once 12-tap filter SIMD is done.
// Until then INTERP_FILTERS_ALL is redefined to 4; the filter loops in this
// file run from EIGHTTAP_REGULAR up to (but not including) this value.
#undef INTERP_FILTERS_ALL
#define INTERP_FILTERS_ALL 4
// All single reference convolve tests are parameterized on block size,
// bit-depth, and function to test.
//
// Note that parameterizing on these variables (and not other parameters) is
// a conscious decision - Jenkins needs some degree of parallelization to run
// the tests within the time limit, but if the number of parameters increases
// too much, the gtest framework does not handle it well (increased overhead per
// test, huge amount of output to stdout, etc.).
//
// Also note that the test suites must be named with the architecture, e.g.,
// C, C_X, AVX2_X, ... The test suite that runs on Jenkins sometimes runs tests
// that cannot deal with intrinsics (e.g., the Valgrind tests on 32-bit x86
// binaries) and will disable tests using a filter like
// --gtest_filter=-:SSE4_1.*. If the test suites are not named this way, the
// testing infrastructure will not selectively filter them properly.
// Value type holding a block's width and height. It is ordered by width
// first and height second, which lets it be stored in a std::set.
class BlockSize {
 public:
  BlockSize(int w, int h) : width_(w), height_(h) {}

  int Width() const { return width_; }
  int Height() const { return height_; }

  // Orders by width; ties are broken on height.
  bool operator<(const BlockSize &other) const {
    if (width_ != other.width_) return width_ < other.width_;
    return height_ < other.height_;
  }

  // Equal only when both dimensions match.
  bool operator==(const BlockSize &other) const {
    return !(width_ != other.width_ || height_ != other.height_);
  }

 private:
  int width_;
  int height_;
};
// Block size / bit depth / test function used to parameterize the tests.
template <typename T>
class TestParam {
public:
TestParam(const BlockSize &block, int bd, T test_func)
: block_(block), bd_(bd), test_func_(test_func) {}
const BlockSize &Block() const { return block_; }
int BitDepth() const { return bd_; }
T TestFunction() const { return test_func_; }
bool operator==(const TestParam &other) const {
return Block() == other.Block() && BitDepth() == other.BitDepth() &&
TestFunction() == other.TestFunction();
}
private:
BlockSize block_;
int bd_;
T test_func_;
};
template <typename T>
std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) {
return os << "TestParam { width:" << test_arg.Block().Width()
<< " height:" << test_arg.Block().Height()
<< " bd:" << test_arg.BitDepth() << " }";
}
// Builds the list of test parameters covering every block width / height that
// needs to be tested (luma sizes plus the smaller chroma sizes) for each of the
// given bit depths. All generated parameters share the same test function.
// The result is ordered by block size first and by bit depth second.
template <typename T>
std::vector<TestParam<T>> GetTestParams(std::initializer_list<int> bit_depths,
                                        T test_func) {
  // std::set keeps the sizes unique and sorted.
  std::set<BlockSize> unique_sizes;
  for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
    const BlockSize luma(block_size_wide[bsize], block_size_high[bsize]);
    unique_sizes.insert(luma);
    // Add in the smaller chroma size when either dimension is 4.
    if (luma.Width() == 4 || luma.Height() == 4) {
      unique_sizes.insert(BlockSize(luma.Width() / 2, luma.Height() / 2));
    }
  }

  std::vector<TestParam<T>> params;
  for (auto it = unique_sizes.begin(); it != unique_sizes.end(); ++it) {
    for (const int depth : bit_depths) {
      params.push_back(TestParam<T>(*it, depth, test_func));
    }
  }
  return params;
}
// Fixture for the tests that check that the test-parameter generators
// themselves work as expected. It adds no state or behavior of its own.
class AV1ConvolveParametersTest : public ::testing::Test {};
// Returns the GetTestParams() list for the two high bit depths used by these
// tests, 10 and 12.
template <typename T>
std::vector<TestParam<T>> GetHighbdTestParams(T test_func) {
  const std::initializer_list<int> high_bit_depths = { 10, 12 };
  return GetTestParams<T>(high_bit_depths, test_func);
}
// Wraps the high bit-depth parameter list in a gtest parameter generator,
// ready to be passed to INSTANTIATE_TEST_SUITE_P.
template <typename T>
::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdParams(
    T test_func) {
  const std::vector<TestParam<T>> params = GetHighbdTestParams(test_func);
  return ::testing::ValuesIn(params);
}
// Checks that the high bit-depth parameter list has the expected number of
// entries, contains only bit depths 10 and 12 (in equal numbers), and carries
// the test function through unchanged.
TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
  const auto params = GetHighbdTestParams(av1_highbd_convolve_x_sr_c);
#if CONFIG_FLEX_PARTITION
  ASSERT_EQ(80U, params.size());
#else
  ASSERT_EQ(60U, params.size());
#endif  // CONFIG_FLEX_PARTITION

  int count_bd10 = 0;
  int count_bd12 = 0;
  for (const auto &param : params) {
    const int bd = param.BitDepth();
    ASSERT_TRUE(bd == 10 || bd == 12);
    const bool same_fn = av1_highbd_convolve_x_sr_c == param.TestFunction();
    ASSERT_TRUE(same_fn);
    if (bd != 10) {
      ++count_bd12;
    } else {
      ++count_bd10;
    }
  }
  ASSERT_EQ(count_bd10, count_bd12);
}
// AV1ConvolveTest is the base class that all convolve tests should derive from.
// It owns the storage and helpers used to generate randomized input buffers
// (low and high bit-depth) and to compare output buffers, and its
// setup/teardown methods reseed the random generator and clear system state.
// Implementors can get the bit-depth / block-size / test function by calling
// GetParam().
template <typename T>
class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
 public:
  virtual ~AV1ConvolveTest() { TearDown(); }

  // Resets the random generator to ACMRandom's deterministic seed.
  void SetUp() override {
    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
  }

  void TearDown() override { libaom_test::ClearSystemState(); }

  // Total number of extra rows / columns of random data generated around a
  // block. The pointers returned by the *RandomInput* methods sit
  // kInputPadding / 2 rows and columns inside the randomized region, which
  // (per the original note) makes them safe to use with an 8-tap filter.
  static constexpr int kInputPadding = 8;

  // Randomizes the first 8-bit input buffer and returns a pointer into it,
  // to be used as a buffer with stride == width. The test param must be
  // passed in explicitly -- gtest does not support calling GetParam()
  // directly within a templatized class. FirstRandomInput8 always returns a
  // pointer to the same memory location -- if two inputs are needed, also
  // use SecondRandomInput8.
  const uint8_t *FirstRandomInput8(const TestParam<T> &param) {
    return RandomInput8(input8_1_, param);
  }

  // Same as FirstRandomInput8, but backed by the second 8-bit buffer.
  const uint8_t *SecondRandomInput8(const TestParam<T> &param) {
    return RandomInput8(input8_2_, param);
  }

  // Some of the intrinsics perform writes in 32 byte chunks. Moreover, some
  // of the intrinsics assume that the stride is also a multiple of 32.
  // To satisfy these constraints and also remain simple, output buffer strides
  // are assumed MAX_SB_SIZE.
  static constexpr int kOutputStride = MAX_SB_SIZE;

  // Check that two 8-bit output buffers (both with stride kOutputStride) are
  // identical over the width x height block; reports the first mismatch.
  void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
                            int height) {
    ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
    for (int row = 0; row < height; ++row) {
      const uint8_t *const lhs = p1 + row * kOutputStride;
      const uint8_t *const rhs = p2 + row * kOutputStride;
      // Fast path: the whole row matches.
      if (memcmp(lhs, rhs, sizeof(*lhs) * width) == 0) continue;
      // Slow path: walk the row so the failure names the offending pixel.
      for (int col = 0; col < width; ++col) {
        ASSERT_EQ(lhs[col], rhs[col])
            << width << "x" << height << " Pixel mismatch at (" << col << ", "
            << row << ")";
      }
    }
  }

  // Check that two 16-bit output buffers (both with stride kOutputStride) are
  // identical over the width x height block; reports the first mismatch.
  void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
                            int height) {
    ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
    for (int row = 0; row < height; ++row) {
      const uint16_t *const lhs = p1 + row * kOutputStride;
      const uint16_t *const rhs = p2 + row * kOutputStride;
      // Fast path: the whole row matches.
      if (memcmp(lhs, rhs, sizeof(*lhs) * width) == 0) continue;
      // Slow path: walk the row so the failure names the offending pixel.
      for (int col = 0; col < width; ++col) {
        ASSERT_EQ(lhs[col], rhs[col])
            << width << "x" << height << " Pixel mismatch at (" << col << ", "
            << row << ")";
      }
    }
  }

  // High bit-depth counterparts of the 8-bit input helpers. Note that the
  // randomized values are capped by bit-depth.
  const uint16_t *FirstRandomInput12(const TestParam<T> &param) {
    return RandomInput12(input16_1_, param);
  }

  const uint16_t *SecondRandomInput12(const TestParam<T> &param) {
    return RandomInput12(input16_2_, param);
  }

#if CONFIG_LR_IMPROVEMENTS
  // Fills the first 16-bit buffer with extreme values only: each sample is
  // either 0 or the maximum value for the parameter's bit depth.
  const uint16_t *FirstRandomInput16Extreme(const TestParam<T> &param) {
    return RandomInput16Extreme(input16_1_, param);
  }
#endif  // CONFIG_LR_IMPROVEMENTS

 private:
  // Randomizes (width + padding) * (height + padding) bytes starting at p and
  // returns the pointer offset by half the padding in both directions.
  const uint8_t *RandomInput8(uint8_t *p, const TestParam<T> &param) {
    EXPECT_EQ(8, param.BitDepth());
    const BlockSize &block = param.Block();
    EXPECT_GE(MAX_SB_SIZE, block.Width());
    EXPECT_GE(MAX_SB_SIZE, block.Height());
    const int padded_w = block.Width() + kInputPadding;
    const int padded_h = block.Height() + kInputPadding;
    Randomize(p, padded_w * padded_h);
    const int border = kInputPadding / 2;
    return p + border * padded_w + border;
  }

  // Fills the first `size` bytes of p with random 8-bit values.
  void Randomize(uint8_t *p, int size) {
    for (int idx = 0; idx < size; ++idx) p[idx] = rnd_.Rand8();
  }

  // 16-bit version of RandomInput8 for bit depths 10 and 12.
  const uint16_t *RandomInput12(uint16_t *p, const TestParam<T> &param) {
    // Check that this is only called with high bit-depths up to 12.
    const int bd = param.BitDepth();
    EXPECT_TRUE(bd == 10 || bd == 12);
    const BlockSize &block = param.Block();
    EXPECT_GE(MAX_SB_SIZE, block.Width());
    EXPECT_GE(MAX_SB_SIZE, block.Height());
    const int padded_w = block.Width() + kInputPadding;
    const int padded_h = block.Height() + kInputPadding;
    Randomize12(p, padded_w * padded_h, bd);
    const int border = kInputPadding / 2;
    return p + border * padded_w + border;
  }

  // Fills the first `size` samples of p with Rand12() values clamped to the
  // range representable at the given bit depth.
  void Randomize12(uint16_t *p, int size, int bit_depth) {
    EXPECT_TRUE(bit_depth == 10 || bit_depth == 12);
    // Make sure the bit depth is capped in case the check above did not stop
    // the test.
    const int bd_capped = bit_depth < 12 ? bit_depth : 12;
    const int max_value = (1 << bd_capped) - 1;
    for (int idx = 0; idx < size; ++idx) {
      p[idx] = static_cast<uint16_t>(Clamp(rnd_.Rand12(), 0, max_value));
    }
  }

  // Returns value limited to the inclusive range [low, high].
  int Clamp(int value, int low, int high) {
    if (value < low) return low;
    if (value > high) return high;
    return value;
  }

#if CONFIG_LR_IMPROVEMENTS
  // Same layout as RandomInput12, but the samples are extreme values only.
  const uint16_t *RandomInput16Extreme(uint16_t *p, const TestParam<T> &param) {
    // Check that this is only called with high bit-depths.
    const int bd = param.BitDepth();
    EXPECT_TRUE(bd == 10 || bd == 12);
    const BlockSize &block = param.Block();
    EXPECT_GE(MAX_SB_SIZE, block.Width());
    EXPECT_GE(MAX_SB_SIZE, block.Height());
    const int padded_w = block.Width() + kInputPadding;
    const int padded_h = block.Height() + kInputPadding;
    RandomizeExtreme(p, padded_w * padded_h, bd);
    const int border = kInputPadding / 2;
    return p + border * padded_w + border;
  }

  // Sets each of the first `size` samples of p to either 0 or
  // (1 << max_bit_range) - 1, chosen by RandBool().
  void RandomizeExtreme(uint16_t *p, int size, int max_bit_range) {
    EXPECT_GE(12, max_bit_range);
    const int max_val = (1 << max_bit_range) - 1;
    for (int idx = 0; idx < size; ++idx) {
      p[idx] = static_cast<uint16_t>(RandBool() ? max_val : 0);
    }
  }

  // Returns 0 or 1, taken from the top bit of an 8-bit random value.
  int RandBool() {
    const uint32_t value = rnd_.Rand8();
    // There's a bit more entropy in the upper bits of this implementation.
    return (value >> 7) & 0x1;
  }
#endif  // CONFIG_LR_IMPROVEMENTS

  static constexpr int kInputStride = MAX_SB_SIZE + kInputPadding;

  libaom_test::ACMRandom rnd_;

  // Statically allocate all the memory that is needed for the tests. Note
  // that we cannot allocate output memory here. It must use DECLARE_ALIGNED,
  // which is a C99 feature and interacts badly with C++ member variables.
  uint8_t input8_1_[kInputStride * kInputStride];
  uint8_t input8_2_[kInputStride * kInputStride];
  uint16_t input16_1_[kInputStride * kInputStride];
  uint16_t input16_2_[kInputStride * kInputStride];
};
/////////////////////////////////////////////////////////
// Single reference convolve-x functions (high bit-depth)
/////////////////////////////////////////////////////////
using highbd_convolve_x_func = void (*)(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd);

// Compares the horizontal convolve function under test against
// av1_highbd_convolve_x_sr_c for all 16 sub-pixel positions and every
// interpolation filter below INTERP_FILTERS_ALL.
class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> {
 public:
  void RunTest() {
    for (int sub_x = 0; sub_x < 16; ++sub_x) {
      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
        TestConvolve(sub_x, static_cast<InterpFilter>(f));
      }
    }
  }

 private:
  // Runs the reference and the tested function on the same random input and
  // asserts that their outputs are identical.
  void TestConvolve(const int sub_x, const InterpFilter filter) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const int bd = param.BitDepth();
    const InterpFilterParams *filter_params_x =
        av1_get_interp_filter_params_with_block_size(filter, w);
    ConvolveParams ref_conv_params =
        get_conv_params_no_round(0, 0, nullptr, 0, 0, bd);
    const uint16_t *input = FirstRandomInput12(param);

    // Output of the C reference.
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    av1_highbd_convolve_x_sr_c(input, w, reference, kOutputStride, w, h,
                               filter_params_x, sub_x, &ref_conv_params, bd);

    // Output of the function under test, using its own ConvolveParams.
    ConvolveParams test_conv_params =
        get_conv_params_no_round(0, 0, nullptr, 0, 0, bd);
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    param.TestFunction()(input, w, test, kOutputStride, w, h, filter_params_x,
                         sub_x, &test_conv_params, bd);

    AssertOutputBufferEq(reference, test, w, h);
  }
};
// Register the test body, then instantiate the suite once per available
// implementation. The instantiation prefix is the architecture name so that
// suites can be selected or excluded with --gtest_filter.
TEST_P(AV1ConvolveXHighbdTest, RunTest) { RunTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdTest,
BuildHighbdParams(av1_highbd_convolve_x_sr_c));
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveXHighbdTest,
BuildHighbdParams(av1_highbd_convolve_x_sr_ssse3));
#endif // HAVE_SSSE3
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXHighbdTest,
BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
#endif // HAVE_AVX2
/////////////////////////////////////////////////////////
// Single reference convolve-y functions (high bit-depth)
/////////////////////////////////////////////////////////
using highbd_convolve_y_func = void (*)(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    int bd);

// Compares the vertical convolve function under test against
// av1_highbd_convolve_y_sr_c for all 16 sub-pixel positions and every
// interpolation filter below INTERP_FILTERS_ALL.
class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> {
 public:
  void RunTest() {
    for (int sub_y = 0; sub_y < 16; ++sub_y) {
      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
        TestConvolve(sub_y, static_cast<InterpFilter>(f));
      }
    }
  }

 private:
  // Runs the reference and the tested function on the same random input and
  // asserts that their outputs are identical.
  void TestConvolve(const int sub_y, const InterpFilter filter) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const int bd = param.BitDepth();
    // The vertical filter is selected from the block height.
    const InterpFilterParams *filter_params_y =
        av1_get_interp_filter_params_with_block_size(filter, h);
    const uint16_t *input = FirstRandomInput12(param);

    // Output of the C reference.
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    av1_highbd_convolve_y_sr_c(input, w, reference, kOutputStride, w, h,
                               filter_params_y, sub_y, bd);

    // Output of the function under test.
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    param.TestFunction()(input, w, test, kOutputStride, w, h, filter_params_y,
                         sub_y, bd);

    AssertOutputBufferEq(reference, test, w, h);
  }
};
// Register the test body, then instantiate the suite once per available
// implementation. The instantiation prefix is the architecture name so that
// suites can be selected or excluded with --gtest_filter.
TEST_P(AV1ConvolveYHighbdTest, RunTest) { RunTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdTest,
BuildHighbdParams(av1_highbd_convolve_y_sr_c));
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveYHighbdTest,
BuildHighbdParams(av1_highbd_convolve_y_sr_ssse3));
#endif // HAVE_SSSE3
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYHighbdTest,
BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
#endif // HAVE_AVX2
///////////////////////////////////////////////////////////////
// Single reference convolve-copy functions (high bit-depth)
///////////////////////////////////////////////////////////////
using highbd_convolve_copy_func = void (*)(const uint16_t *src,
                                           ptrdiff_t src_stride, uint16_t *dst,
                                           ptrdiff_t dst_stride, int w, int h);

// Compares the copy function under test against aom_highbd_convolve_copy_c
// on one randomized input block.
class AV1ConvolveCopyHighbdTest
    : public AV1ConvolveTest<highbd_convolve_copy_func> {
 public:
  void RunTest() {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const uint16_t *input = FirstRandomInput12(param);

    // Output of the C reference.
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    aom_highbd_convolve_copy_c(input, w, reference, kOutputStride, w, h);

    // Output of the function under test.
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    param.TestFunction()(input, w, test, kOutputStride, w, h);

    AssertOutputBufferEq(reference, test, w, h);
  }
};
// Register the test body, then instantiate the suite once per available
// implementation. The instantiation prefix is the architecture name so that
// suites can be selected or excluded with --gtest_filter.
TEST_P(AV1ConvolveCopyHighbdTest, RunTest) { RunTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyHighbdTest,
BuildHighbdParams(aom_highbd_convolve_copy_c));
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyHighbdTest,
BuildHighbdParams(aom_highbd_convolve_copy_sse2));
#endif // HAVE_SSE2
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
BuildHighbdParams(aom_highbd_convolve_copy_avx2));
#endif // HAVE_AVX2
//////////////////////////////////////////////////////////
// Single reference convolve-2d functions (high bit-depth)
//////////////////////////////////////////////////////////
using highbd_convolve_2d_func = void (*)(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd);

// Compares the 2D convolve function under test against
// av1_highbd_convolve_2d_sr_c for every combination of horizontal / vertical
// sub-pixel position (16 each) and horizontal / vertical interpolation filter.
class AV1Convolve2DHighbdTest
    : public AV1ConvolveTest<highbd_convolve_2d_func> {
 public:
  void RunTest() {
    for (int sub_x = 0; sub_x < 16; ++sub_x) {
      for (int sub_y = 0; sub_y < 16; ++sub_y) {
        for (int hf = EIGHTTAP_REGULAR; hf < INTERP_FILTERS_ALL; ++hf) {
          const InterpFilter h_filter = static_cast<InterpFilter>(hf);
          for (int vf = EIGHTTAP_REGULAR; vf < INTERP_FILTERS_ALL; ++vf) {
            const InterpFilter v_filter = static_cast<InterpFilter>(vf);
            TestConvolve(h_filter, v_filter, sub_x, sub_y);
          }
        }
      }
    }
  }

 private:
  // Runs the reference and the tested function on the same random input and
  // asserts that their outputs are identical.
  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
                    const int sub_x, const int sub_y) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const int bd = param.BitDepth();
    // Horizontal filter is selected from the width, vertical from the height.
    const InterpFilterParams *filter_params_x =
        av1_get_interp_filter_params_with_block_size(h_f, w);
    const InterpFilterParams *filter_params_y =
        av1_get_interp_filter_params_with_block_size(v_f, h);
    const uint16_t *input = FirstRandomInput12(param);

    // Output of the C reference.
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    ConvolveParams ref_conv_params =
        get_conv_params_no_round(0, 0, nullptr, 0, 0, bd);
    av1_highbd_convolve_2d_sr_c(input, w, reference, kOutputStride, w, h,
                                filter_params_x, filter_params_y, sub_x, sub_y,
                                &ref_conv_params, bd);

    // Output of the function under test, using its own ConvolveParams.
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    ConvolveParams test_conv_params =
        get_conv_params_no_round(0, 0, nullptr, 0, 0, bd);
    param.TestFunction()(input, w, test, kOutputStride, w, h, filter_params_x,
                         filter_params_y, sub_x, sub_y, &test_conv_params, bd);

    AssertOutputBufferEq(reference, test, w, h);
  }
};
// Register the test body, then instantiate the suite once per available
// implementation. The instantiation prefix is the architecture name so that
// suites can be selected or excluded with --gtest_filter.
TEST_P(AV1Convolve2DHighbdTest, RunTest) { RunTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DHighbdTest,
BuildHighbdParams(av1_highbd_convolve_2d_sr_c));
#if HAVE_SSSE3
INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DHighbdTest,
BuildHighbdParams(av1_highbd_convolve_2d_sr_ssse3));
#endif // HAVE_SSSE3
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DHighbdTest,
BuildHighbdParams(av1_highbd_convolve_2d_sr_avx2));
#endif // HAVE_AVX2
//////////////////////////
// Compound Convolve Tests
//////////////////////////
// The compound functions do not work for chroma block sizes, so this
// generator produces test parameters for the luma block sizes only. The
// result is ordered by bit depth first and by block size second.
template <typename T>
std::vector<TestParam<T>> GetLumaTestParams(
    std::initializer_list<int> bit_depths, T test_func) {
  // std::set keeps the sizes unique and sorted.
  std::set<BlockSize> luma_sizes;
  for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
    luma_sizes.insert(
        BlockSize(block_size_wide[bsize], block_size_high[bsize]));
  }

  std::vector<TestParam<T>> params;
  for (auto bd = bit_depths.begin(); bd != bit_depths.end(); ++bd) {
    for (const BlockSize &size : luma_sizes) {
      params.push_back(TestParam<T>(size, *bd, test_func));
    }
  }
  return params;
}
// Returns the luma-only parameter list for the two high bit depths used by
// these tests, 10 and 12.
template <typename T>
std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) {
  const std::initializer_list<int> high_bit_depths = { 10, 12 };
  return GetLumaTestParams<T>(high_bit_depths, test_func);
}
// Checks that the luma-only high bit-depth parameter list has
// BLOCK_SIZES_ALL entries per bit depth, contains only bit depths 10 and 12
// (in equal numbers), and carries the test function through unchanged.
TEST_F(AV1ConvolveParametersTest, GetHighbdLumaTestParams) {
  const auto params = GetHighbdLumaTestParams(av1_highbd_dist_wtd_convolve_x_c);
  ASSERT_EQ(static_cast<size_t>(BLOCK_SIZES_ALL * 2), params.size());

  int count_bd10 = 0;
  int count_bd12 = 0;
  for (const auto &param : params) {
    const int bd = param.BitDepth();
    ASSERT_TRUE(10 == bd || 12 == bd);
    const bool same_fn =
        av1_highbd_dist_wtd_convolve_x_c == param.TestFunction();
    ASSERT_TRUE(same_fn);
    if (bd != 10) {
      ++count_bd12;
    } else {
      ++count_bd10;
    }
  }
  ASSERT_EQ(count_bd10, count_bd12);
}
// Wraps the luma-only high bit-depth parameter list in a gtest parameter
// generator, ready to be passed to INSTANTIATE_TEST_SUITE_P.
template <typename T>
::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdLumaParams(
    T test_func) {
  const std::vector<TestParam<T>> params = GetHighbdLumaTestParams(test_func);
  return ::testing::ValuesIn(params);
}
// Compound cases also need to test different frame offsets and weightings.
// CompoundParam holds one forward / backward offset pair.
class CompoundParam {
 public:
  CompoundParam(int fwd_offset, int bck_offset)
      : fwd_offset_(fwd_offset), bck_offset_(bck_offset) {}

  // False only when both offsets equal 1 << (DIST_PRECISION_BITS - 1);
  // true for every other pair.
  bool UseWtdCompAvg() const {
    const int equal_weight = 1 << (DIST_PRECISION_BITS - 1);
    return !(bck_offset_ == equal_weight && fwd_offset_ == equal_weight);
  }

  int FwdOffset() const { return fwd_offset_; }
  int BckOffset() const { return bck_offset_; }

 private:
  int fwd_offset_;
  int bck_offset_;
};
std::vector<CompoundParam> GetCompoundParams() {
std::vector<CompoundParam> result;
result.push_back(CompoundParam(1 << (DIST_PRECISION_BITS - 1),
1 << (DIST_PRECISION_BITS - 1)));
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 4; ++l) {
result.push_back(CompoundParam(quant_dist_lookup_table[l][k],
quant_dist_lookup_table[l][1 - k]));
}
}
return result;
}
// Checks that GetCompoundParams() yields nine entries, of which only the
// first does not use the weighted compound average.
TEST_F(AV1ConvolveParametersTest, GetCompoundParams) {
  const std::vector<CompoundParam> params = GetCompoundParams();
  ASSERT_EQ(9U, params.size());
  auto it = params.begin();
  ASSERT_FALSE(it->UseWtdCompAvg());
  for (++it; it != params.end(); ++it) {
    ASSERT_TRUE(it->UseWtdCompAvg());
  }
}
/////////////////////////////////////////////////
// Compound convolve-x functions (high bit-depth)
/////////////////////////////////////////////////
// Builds the ConvolveParams shared by the compound convolve tests.
//   do_average: forwarded as the first argument of
//               get_conv_params_no_round(); the tests pass 0 when
//               convolving the first source and 1 for the second.
//   conv_buf:   forwarded as the third argument of
//               get_conv_params_no_round().
//   width:      forwarded as the fourth argument of
//               get_conv_params_no_round(). Callers in this file pass
//               kOutputStride, so this is presumably the stride of conv_buf
//               -- confirm against get_conv_params_no_round().
//   bit_depth:  forwarded as the last argument.
//   compound:   supplies the fwd_offset / bck_offset values written into the
//               returned params.
// The second and fifth arguments of get_conv_params_no_round() are fixed at
// 0 and 1 respectively.
ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf,
                                 int width, int bit_depth,
                                 const CompoundParam &compound) {
  ConvolveParams conv_params =
      get_conv_params_no_round(do_average, 0, conv_buf, width, 1, bit_depth);
  conv_params.fwd_offset = compound.FwdOffset();
  conv_params.bck_offset = compound.BckOffset();
  return conv_params;
}
// Compares a compound (two-source) 1D convolve function against its C
// reference for every sub-pixel position, interpolation filter and compound
// offset pair. Subclasses can swap the reference function and the filter
// lookup through ReferenceFunc() and FilterParams().
class AV1ConvolveXHighbdCompoundTest
    : public AV1ConvolveTest<highbd_convolve_x_func> {
 public:
  void RunTest() {
    const std::vector<CompoundParam> compound_params = GetCompoundParams();
    for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
        const InterpFilter filter = static_cast<InterpFilter>(f);
        for (const CompoundParam &compound : compound_params) {
          TestConvolve(sub_pix, filter, compound);
        }
      }
    }
  }

 protected:
  // Filter parameters for this direction; the horizontal test selects them
  // from the block width.
  virtual const InterpFilterParams *FilterParams(InterpFilter f,
                                                 const BlockSize &block) const {
    const int width = block.Width();
    return av1_get_interp_filter_params_with_block_size(f, width);
  }

  // C implementation that the function under test is compared against.
  virtual highbd_convolve_x_func ReferenceFunc() const {
    return av1_highbd_dist_wtd_convolve_x_c;
  }

 private:
  // Runs the reference and the tested function on the same two random inputs
  // and asserts that both the intermediate convolve buffer and the final
  // output are identical.
  void TestConvolve(const int sub_pix, const InterpFilter filter,
                    const CompoundParam &compound) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const uint16_t *first_src = FirstRandomInput12(param);
    const uint16_t *second_src = SecondRandomInput12(param);

    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
    Convolve(ReferenceFunc(), first_src, second_src, reference,
             reference_conv_buf, compound, sub_pix, filter);

    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
    Convolve(param.TestFunction(), first_src, second_src, test, test_conv_buf,
             compound, sub_pix, filter);

    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, w, h);
    AssertOutputBufferEq(reference, test, w, h);
  }

  // Calls test_func twice: first on src1 with do_average = 0, then on src2
  // with do_average = 1. Both calls use the same dst and conv_buf.
  void Convolve(highbd_convolve_x_func test_func, const uint16_t *src1,
                const uint16_t *src2, uint16_t *dst, CONV_BUF_TYPE *conv_buf,
                const CompoundParam &compound, const int sub_pix,
                const InterpFilter filter) {
    const auto &param = GetParam();
    const BlockSize &block = param.Block();
    const int w = block.Width();
    const int h = block.Height();
    const int bd = param.BitDepth();
    const InterpFilterParams *filter_params = FilterParams(filter, block);

    const uint16_t *const sources[2] = { src1, src2 };
    for (int pass = 0; pass < 2; ++pass) {
      // Fresh params for each pass; `pass` doubles as the do_average flag.
      ConvolveParams conv_params =
          GetConvolveParams(pass, conv_buf, kOutputStride, bd, compound);
      test_func(sources[pass], w, dst, kOutputStride, w, h, filter_params,
                sub_pix, &conv_params, bd);
    }
  }
};
// Register the test body, then instantiate the suite (luma block sizes only)
// once per available implementation. The instantiation prefix is the
// architecture name so that suites can be selected or excluded with
// --gtest_filter.
TEST_P(AV1ConvolveXHighbdCompoundTest, RunTest) { RunTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(
C, AV1ConvolveXHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_c));
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
SSE4_1, AV1ConvolveXHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sse4_1));
#endif // HAVE_SSE4_1
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1ConvolveXHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2));
#endif // HAVE_AVX2
/////////////////////////////////////////////////
// Compound convolve-y functions (high bit-depth)
/////////////////////////////////////////////////
// The X and Y compound convolve functions have the same type signature and
// test logic, so the Y test reuses the X fixture and only overrides the
// reference function and the dimension used to look up the filter.
class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest {
  // The vertical test selects its filter parameters from the block height.
  const InterpFilterParams *FilterParams(
      InterpFilter f, const BlockSize &block) const override {
    const int height = block.Height();
    return av1_get_interp_filter_params_with_block_size(f, height);
  }

  highbd_convolve_x_func ReferenceFunc() const override {
    return av1_highbd_dist_wtd_convolve_y_c;
  }
};
// Register the test body, then instantiate the suite (luma block sizes only)
// once per available implementation. The instantiation prefix is the
// architecture name so that suites can be selected or excluded with
// --gtest_filter.
TEST_P(AV1ConvolveYHighbdCompoundTest, RunTest) { RunTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(
C, AV1ConvolveYHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_c));
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
SSE4_1, AV1ConvolveYHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sse4_1));
#endif // HAVE_SSE4_1
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1ConvolveYHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2));
#endif // HAVE_AVX2
///////////////////////////////////////////////////////
// Compound convolve-2d-copy functions (high bit-depth)
///////////////////////////////////////////////////////
using highbd_compound_conv_2d_copy_func = void (*)(const uint16_t *src,
                                                   int src_stride,
                                                   uint16_t *dst,
                                                   int dst_stride, int w,
                                                   int h,
                                                   ConvolveParams *conv_params,
                                                   int bd);

// Compares the compound 2D-copy function under test against
// av1_highbd_dist_wtd_convolve_2d_copy_c for every compound offset pair, and
// provides a speed comparison between the two.
class AV1Convolve2DCopyHighbdCompoundTest
    : public AV1ConvolveTest<highbd_compound_conv_2d_copy_func> {
 public:
  // Correctness check over all compound offset pairs.
  void RunTest() {
    for (const CompoundParam &compound : GetCompoundParams()) {
      TestConvolve(compound);
    }
  }

  // Timing comparison over all compound offset pairs.
  void SpeedTest() {
    for (const CompoundParam &compound : GetCompoundParams()) {
      SpeedTestConvolve(compound);
    }
  }

 private:
  // Times a fixed number of (no-average, average) call pairs of the C
  // reference and of the function under test, then prints the ratio of the
  // reference time to the tested time.
  void SpeedTestConvolve(const CompoundParam &compound) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const int bd = param.BitDepth();
    const int num_iterations = 100000;
    const uint16_t *input = FirstRandomInput12(param);
    // NOTE(review): conv_buf serves as both the destination and the
    // intermediate convolve buffer here; only timing is measured.
    DECLARE_ALIGNED(32, uint16_t, conv_buf[MAX_SB_SQUARE]);
    const highbd_compound_conv_2d_copy_func test_func = param.TestFunction();
    ConvolveParams conv_params =
        GetConvolveParams(0, conv_buf, kOutputStride, bd, compound);
    ConvolveParams conv_params_do_avg =
        GetConvolveParams(1, conv_buf, kOutputStride, bd, compound);

    // Time the C reference.
    aom_usec_timer ref_timer;
    aom_usec_timer_start(&ref_timer);
    for (int iter = 0; iter < num_iterations; ++iter) {
      av1_highbd_dist_wtd_convolve_2d_copy_c(input, w, conv_buf, kOutputStride,
                                             w, h, &conv_params, bd);
      av1_highbd_dist_wtd_convolve_2d_copy_c(input, w, conv_buf, kOutputStride,
                                             w, h, &conv_params_do_avg, bd);
    }
    aom_usec_timer_mark(&ref_timer);
    const int ref_elapsed =
        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));

    // Time the function under test.
    aom_usec_timer test_timer;
    aom_usec_timer_start(&test_timer);
    for (int iter = 0; iter < num_iterations; ++iter) {
      test_func(input, w, conv_buf, kOutputStride, w, h, &conv_params, bd);
      test_func(input, w, conv_buf, kOutputStride, w, h, &conv_params_do_avg,
                bd);
    }
    aom_usec_timer_mark(&test_timer);
    const int test_elapsed =
        static_cast<int>(aom_usec_timer_elapsed(&test_timer));

    printf("%d x %d block: bd: %d, Scaling = %.2f\n", w, h, bd,
           static_cast<double>(ref_elapsed) / test_elapsed);
  }

  // Runs the reference and the tested function on the same two random inputs
  // and asserts that both the intermediate convolve buffer and the final
  // output are identical.
  void TestConvolve(const CompoundParam &compound) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const uint16_t *first_src = FirstRandomInput12(param);
    const uint16_t *second_src = SecondRandomInput12(param);

    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
    Convolve(av1_highbd_dist_wtd_convolve_2d_copy_c, first_src, second_src,
             reference, reference_conv_buf, compound);

    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
    Convolve(param.TestFunction(), first_src, second_src, test, test_conv_buf,
             compound);

    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, w, h);
    AssertOutputBufferEq(reference, test, w, h);
  }

  // Calls test_func twice: first on src1 with do_average = 0, then on src2
  // with do_average = 1. Both calls use the same dst and conv_buf.
  void Convolve(highbd_compound_conv_2d_copy_func test_func,
                const uint16_t *src1, const uint16_t *src2, uint16_t *dst,
                uint16_t *conv_buf, const CompoundParam &compound) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const int bd = param.BitDepth();

    const uint16_t *const sources[2] = { src1, src2 };
    for (int pass = 0; pass < 2; ++pass) {
      // Fresh params for each pass; `pass` doubles as the do_average flag.
      ConvolveParams conv_params =
          GetConvolveParams(pass, conv_buf, kOutputStride, bd, compound);
      test_func(sources[pass], w, dst, kOutputStride, w, h, &conv_params, bd);
    }
  }
};
// Register the test bodies, then instantiate the suite (luma block sizes
// only) once per available implementation. The instantiation prefix is the
// architecture name so that suites can be selected or excluded with
// --gtest_filter.
TEST_P(AV1Convolve2DCopyHighbdCompoundTest, RunTest) { RunTest(); }
// The DISABLED_ prefix keeps the speed test out of normal runs; gtest only
// executes it when --gtest_also_run_disabled_tests is given.
TEST_P(AV1Convolve2DCopyHighbdCompoundTest, DISABLED_SpeedTest) { SpeedTest(); }
// The C instantiation compares the reference function against itself.
INSTANTIATE_TEST_SUITE_P(
C, AV1Convolve2DCopyHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_c));
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
SSE4_1, AV1Convolve2DCopyHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_sse4_1));
#endif // HAVE_SSE4_1
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1Convolve2DCopyHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2));
#endif // HAVE_AVX2
//////////////////////////////////////////////////
// Compound convolve-2d functions (high bit-depth)
//////////////////////////////////////////////////
// Compares the compound 2D convolve function under test against
// av1_highbd_dist_wtd_convolve_2d_c for every combination of horizontal /
// vertical filter, horizontal / vertical sub-pixel position and compound
// offset pair.
class AV1Convolve2DHighbdCompoundTest
    : public AV1ConvolveTest<highbd_convolve_2d_func> {
 public:
  void RunTest() {
    const std::vector<CompoundParam> compound_params = GetCompoundParams();
    for (int hf = EIGHTTAP_REGULAR; hf < INTERP_FILTERS_ALL; ++hf) {
      const InterpFilter h_filter = static_cast<InterpFilter>(hf);
      for (int vf = EIGHTTAP_REGULAR; vf < INTERP_FILTERS_ALL; ++vf) {
        const InterpFilter v_filter = static_cast<InterpFilter>(vf);
        for (int sub_x = 0; sub_x < 16; ++sub_x) {
          for (int sub_y = 0; sub_y < 16; ++sub_y) {
            for (const CompoundParam &compound : compound_params) {
              TestConvolve(h_filter, v_filter, sub_x, sub_y, compound);
            }
          }
        }
      }
    }
  }

 private:
  // Runs the reference and the tested function on the same two random inputs
  // and asserts that both the intermediate convolve buffer and the final
  // output are identical.
  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
                    const int sub_x, const int sub_y,
                    const CompoundParam &compound) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    const uint16_t *first_src = FirstRandomInput12(param);
    const uint16_t *second_src = SecondRandomInput12(param);

    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
    Convolve(av1_highbd_dist_wtd_convolve_2d_c, first_src, second_src,
             reference, reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);

    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
    Convolve(param.TestFunction(), first_src, second_src, test, test_conv_buf,
             compound, h_f, v_f, sub_x, sub_y);

    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, w, h);
    AssertOutputBufferEq(reference, test, w, h);
  }

  // Calls test_func twice: first on src1 with do_average = 0, then on src2
  // with do_average = 1. Both calls use the same dst and conv_buf.
  void Convolve(highbd_convolve_2d_func test_func, const uint16_t *src1,
                const uint16_t *src2, uint16_t *dst, uint16_t *conv_buf,
                const CompoundParam &compound, const InterpFilter h_f,
                const InterpFilter v_f, const int sub_x, const int sub_y) {
    const auto &param = GetParam();
    const int w = param.Block().Width();
    const int h = param.Block().Height();
    // Horizontal filter is selected from the width, vertical from the height.
    const InterpFilterParams *filter_params_x =
        av1_get_interp_filter_params_with_block_size(h_f, w);
    const InterpFilterParams *filter_params_y =
        av1_get_interp_filter_params_with_block_size(v_f, h);
    const int bd = param.BitDepth();

    const uint16_t *const sources[2] = { src1, src2 };
    for (int pass = 0; pass < 2; ++pass) {
      // Fresh params for each pass; `pass` doubles as the do_average flag.
      ConvolveParams conv_params =
          GetConvolveParams(pass, conv_buf, kOutputStride, bd, compound);
      test_func(sources[pass], w, dst, kOutputStride, w, h, filter_params_x,
                filter_params_y, sub_x, sub_y, &conv_params, bd);
    }
  }
};
// Registers the bit-exactness test for the high bit-depth compound 2D
// convolve fixture. No speed test is registered for this fixture.
TEST_P(AV1Convolve2DHighbdCompoundTest, RunTest) { RunTest(); }
// One instantiation per implementation. The suite prefix names the
// architecture; the SIMD variants are compiled in only when the matching
// HAVE_* flag is set.
INSTANTIATE_TEST_SUITE_P(
C, AV1Convolve2DHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_c));
#if HAVE_SSE4_1
INSTANTIATE_TEST_SUITE_P(
SSE4_1, AV1Convolve2DHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sse4_1));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1Convolve2DHighbdCompoundTest,
BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_avx2));
#endif
//////////////////////////////////////////////////////////
// Nonseparable convolve-2d functions (high bit-depth)
//////////////////////////////////////////////////////////
#if CONFIG_LR_IMPROVEMENTS
// Pointer type for the high bit-depth nonseparable 2D convolution kernels
// exercised by the fixtures below (for example av1_convolve_symmetric_highbd_c
// and its SIMD counterparts). The kernel reads `src`, applies the taps in
// `filter` according to `filter_config`, and writes to `dst`; the block_*
// arguments bound the rows and columns to process.
using highbd_convolve_nonsep_2d_func = void (*)(
    const uint16_t *src, int src_stride,
    const NonsepFilterConfig *filter_config, const int16_t *filter,
    uint16_t *dst, int dst_stride, int bit_depth, int block_row_begin,
    int block_row_end, int block_col_begin, int block_col_end);
// Bit-exactness and speed fixture for the high bit-depth nonseparable 2D
// convolution kernels. The function under test is compared against a C
// reference selected by restoration type:
//   RESTORE_PC_WIENER     -> av1_convolve_symmetric_highbd_c
//   RESTORE_WIENER_NONSEP -> av1_convolve_symmetric_subtract_center_highbd_c
// Every comparison is repeated for two filter configurations: index 0 is the
// luma-sized tap layout and index 1 the chroma-sized one.
class AV1ConvolveNonSep2DHighbdTest
    : public AV1ConvolveTest<highbd_convolve_nonsep_2d_func> {
 public:
  // Runs kTestIterations bit-match rounds, drawing fresh random filter taps
  // before each round. Stops at the first fatal failure so a single mismatch
  // is not reported once per remaining iteration.
  void RunTest(RestorationType rtype) {
    for (int i = 0; i < kTestIterations; i++) {
      SetFilterTaps();
      TestConvolve(FilterTaps_, rtype);
      if (HasFatalFailure()) return;
    }
  }

  // Times the C reference against the function under test and prints the
  // per-pixel cost of each.
  void RunSpeedTest(RestorationType rtype) {
    // FilterTaps_ has no initializer, so it must be populated before the
    // kernels read it.
    SetFilterTaps();
    SpeedTestConvolve(FilterTaps_, rtype);
  }

 private:
  // Runs the reference and the function under test on the same input, once
  // per filter configuration selected by `rtype`, and asserts that the two
  // outputs are identical over a width x height region.
  void BitMatchTest(const uint16_t *input, int input_stride, int width,
                    int height, const int16_t *filter, uint16_t *reference,
                    uint16_t *test, int dst_stride, int bit_depth,
                    int block_row_begin, int block_row_end, int block_col_begin,
                    int block_col_end, RestorationType rtype) {
    const NonsepFilterConfig *filter_config[2] = { nullptr, nullptr };
    highbd_convolve_nonsep_2d_func ref_func = av1_convolve_symmetric_highbd_c;
    const int num_planes = 2;

    if (rtype == RESTORE_PC_WIENER) {
      ref_func = av1_convolve_symmetric_highbd_c;
      filter_config[0] = &UnconstrainedSumFilterConfig_;
      filter_config[1] = &PcWienerNonsepFilterConfigChroma_;
    }

    // When CONFIG_WIENER_NONSEP=1, luma and chroma planes use a different
    // number of filter taps and both need to be tested. Here, luma is tested
    // for 12/13-tap filtering whereas chroma is tested for 6-tap filtering.
    if (rtype == RESTORE_WIENER_NONSEP) {
      ref_func = av1_convolve_symmetric_subtract_center_highbd_c;
      filter_config[0] = &UnitSumFilterConfig_;
      filter_config[1] = &UnitSumFilterConfigChroma_;
    }

    // A gtest assertion rather than assert(): the latter is compiled out when
    // NDEBUG is defined, which would let a null config reach the kernels.
    ASSERT_TRUE(filter_config[0] != nullptr && filter_config[1] != nullptr)
        << "Unsupported restoration type";

    for (int plane = 0; plane < num_planes; plane++) {
      ref_func(input, input_stride, filter_config[plane], filter, reference,
               dst_stride, bit_depth, block_row_begin, block_row_end,
               block_col_begin, block_col_end);

      GetParam().TestFunction()(input, input_stride, filter_config[plane],
                                filter, test, dst_stride, bit_depth,
                                block_row_begin, block_row_end, block_col_begin,
                                block_col_end);

      AssertOutputBufferEq(reference, test, width, height);
    }
  }

  // Bit-match test on two data sets: 12-bit random input with the supplied
  // `filter`, followed by extreme-valued input with extreme-valued taps.
  void TestConvolve(const int16_t *filter, RestorationType rtype) {
    const int width = GetParam().Block().Width();
    const int height = GetParam().Block().Height();
    const int bit_depth = GetParam().BitDepth();

    const uint16_t *input = FirstRandomInput12(GetParam());
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);

    ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
        << "Not enough padding for 7x7 filters";

    // Skip kMaxTapOffset rows and columns so that taps reaching up/left of
    // the first processed pixel stay inside the input buffer.
    const uint16_t *centered_input =
        input + kMaxTapOffset * width + kMaxTapOffset;
    const int input_stride = width;

    BitMatchTest(centered_input, input_stride, width, height, filter, reference,
                 test, kOutputStride, bit_depth, 0, height, 0, width, rtype);

    // Extreme value test
    const uint16_t *extreme_input = FirstRandomInput16Extreme(GetParam());
    const uint16_t *centered_extreme_input =
        extreme_input + kMaxTapOffset * width + kMaxTapOffset;
    int16_t extreme_taps[kNumSymmetricTaps + 1];
    RandomizeExtremeFilterTap(extreme_taps, kNumSymmetricTaps + 1,
                              kMaxPrecisionBeforeOverflow);
    BitMatchTest(centered_extreme_input, input_stride, width, height,
                 extreme_taps, reference, test, kOutputStride, bit_depth, 0,
                 height, 0, width, rtype);
  }

  // Times kSpeedIterations calls of the reference and of the function under
  // test for each of the two filter configurations and prints the results.
  void SpeedTestConvolve(const int16_t *filter, RestorationType rtype) {
    const int width = GetParam().Block().Width();
    const int height = GetParam().Block().Height();
    const int bit_depth = GetParam().BitDepth();
    const int num_planes = 2;

    const uint16_t *input = FirstRandomInput12(GetParam());
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);

    ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
        << "Not enough padding for 7x7 filters";
    const uint16_t *centered_input =
        input + kMaxTapOffset * width + kMaxTapOffset;

    const NonsepFilterConfig *filter_config[2] = { nullptr, nullptr };
    highbd_convolve_nonsep_2d_func ref_func = av1_convolve_symmetric_highbd_c;

    if (rtype == RESTORE_PC_WIENER) {
      ref_func = av1_convolve_symmetric_highbd_c;
      filter_config[0] = &UnconstrainedSumFilterConfig_;
      filter_config[1] = &PcWienerNonsepFilterConfigChroma_;
    }

    // When CONFIG_WIENER_NONSEP=1, luma and chroma use a different number of
    // filter taps and both need to be tested. Here, luma is tested for
    // 12/13-tap filtering whereas chroma is tested for 6-tap filtering.
    if (rtype == RESTORE_WIENER_NONSEP) {
      ref_func = av1_convolve_symmetric_subtract_center_highbd_c;
      filter_config[0] = &UnitSumFilterConfig_;
      filter_config[1] = &UnitSumFilterConfigChroma_;
    }

    // Same guard as the bit-match path: never hand a null config to a kernel.
    ASSERT_TRUE(filter_config[0] != nullptr && filter_config[1] != nullptr)
        << "Unsupported restoration type";

    for (int plane = 0; plane < num_planes; plane++) {
      // Calculate time taken by reference/c function
      aom_usec_timer timer;
      aom_usec_timer_start(&timer);
      for (int i = 0; i < kSpeedIterations; ++i) {
        ref_func(centered_input, width, filter_config[plane], filter, reference,
                 kOutputStride, bit_depth, 0, height, 0, width);
      }
      aom_usec_timer_mark(&timer);
      auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

      // Calculate time taken by optimized/intrinsic function
      aom_usec_timer_start(&timer);
      for (int i = 0; i < kSpeedIterations; ++i) {
        GetParam().TestFunction()(centered_input, width, filter_config[plane],
                                  filter, test, kOutputStride, bit_depth, 0,
                                  height, 0, width);
      }
      aom_usec_timer_mark(&timer);
      auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

      // Timer values are scaled by 1000 and divided by the number of pixels
      // processed across all iterations.
      float c_time_per_pixel =
          1000.0f * elapsed_time_c / (kSpeedIterations * width * height);
      float opt_time_per_pixel =
          1000.0f * elapsed_time_opt / (kSpeedIterations * width * height);
      float scaling = c_time_per_pixel / opt_time_per_pixel;
      printf(
          "plane=%3d, %3dx%-3d: c_time_per_pixel=%10.5f, "
          "opt_time_per_pixel=%10.5f, scaling=%f \n",
          plane, width, height, c_time_per_pixel, opt_time_per_pixel, scaling);
    }
  }

  // Generates NonsepFilterConfig compliant origin symmetric filter tap values.
  // FilterTaps_ holds kNumSymmetricTaps + 1 entries. The first
  // kNumSymmetricTaps entries are the symmetric taps, each shared by a pair of
  // mirrored pixel offsets; these are the taps used in the
  // CONFIG_WIENER_NONSEP use case where the center tap is constrained so that
  // the filter sums to one. The last entry (index kNumSymmetricTaps) is the
  // unconstrained center tap intended for the CONFIG_PC_WIENER use case.
  void SetFilterTaps() {
    Randomize(FilterTaps_, kNumSymmetricTaps + 1, kMaxPrecisionBeforeOverflow);
  }

  // Fills the array p with random values. Masking with
  // (1 << max_bit_range) - 1 keeps only the low max_bit_range bits, so every
  // stored value lies in [0, 2^max_bit_range - 1].
  void Randomize(int16_t *p, int size, int max_bit_range) {
    ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
    for (int i = 0; i < size; ++i) {
      p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1);
    }
  }

  // Fills the array p with a random mix of the largest and smallest values of
  // a signed max_bit_range-bit integer, i.e. 2^(max_bit_range - 1) - 1 and
  // -2^(max_bit_range - 1).
  void RandomizeExtremeFilterTap(int16_t *p, int size, int max_bit_range) {
    ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
    const int sign_max_val = (1 << (max_bit_range - 1)) - 1;
    for (int i = 0; i < size; ++i) {
      p[i] =
          static_cast<int16_t>(RandBool() ? sign_max_val : -(sign_max_val + 1));
    }
  }

  // Returns 0 or 1, taken from the top bit of an 8-bit random value.
  int RandBool() {
    const uint32_t value = rnd_.Rand8();
    // There's a bit more entropy in the upper bits of this implementation.
    return (value >> 7) & 0x1;
  }

  libaom_test::ACMRandom rnd_;
  // Used both as the bit range of the random taps and as the prec_bits field
  // of the filter configurations below.
  static constexpr int kMaxPrecisionBeforeOverflow = 12;
  static constexpr int kNumSymmetricTaps = 12;
  static constexpr int kNumSymmetricTapsChroma = 6;
  static constexpr int kMaxTapOffset = 3;  // Filters are 7x7.
  static constexpr int kSpeedIterations = 10000;
  static constexpr int kTestIterations = 100;

  // Configuration for nonseparable 7x7 filters for DIAMOND shape.
  // Format is offset (i) row and (ii) column from center pixel
  // and the (iii) filter-tap index that multiplies the pixel at
  // the respective offset.
  const int NonsepConfig_[25][3] = {
    { -3, 0, 0 },  { 3, 0, 0 },  { -2, -1, 1 }, { 2, 1, 1 },  { -2, 0, 2 },
    { 2, 0, 2 },   { -2, 1, 3 }, { 2, -1, 3 },  { -1, -2, 4 }, { 1, 2, 4 },
    { -1, -1, 5 }, { 1, 1, 5 },  { -1, 0, 6 },  { 1, 0, 6 },  { -1, 1, 7 },
    { 1, -1, 7 },  { -1, 2, 8 }, { 1, -2, 8 },  { 0, -3, 9 }, { 0, 3, 9 },
    { 0, -2, 10 }, { 0, 2, 10 }, { 0, -1, 11 }, { 0, 1, 11 }, { 0, 0, 12 },
  };

  // Chroma layout with an explicit center tap: six mirrored offset pairs plus
  // the center pixel at tap index 6. Same row/column/tap-index format as
  // above.
  const int wienerns_wout_subtract_center_config_uv_from_uv_[13][3] = {
    { 1, 0, 0 },   { -1, 0, 0 }, { 0, 1, 1 },  { 0, -1, 1 }, { 1, 1, 2 },
    { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 }, { 2, 0, 4 },  { -2, 0, 4 },
    { 0, 2, 5 },   { 0, -2, 5 }, { 0, 0, 6 },
  };

  // Filters use all unique taps.
  const NonsepFilterConfig UnconstrainedSumFilterConfig_ = {
    kMaxPrecisionBeforeOverflow,  // prec_bits
    2 * kNumSymmetricTaps + 1,    // num_pixels
    0,                            // num_pixels2
    NonsepConfig_,                // config
    nullptr,                      // config2
    0,                            // strict_bounds
    0                             // subtract_center
  };

  // Chroma counterpart of UnconstrainedSumFilterConfig_.
  const NonsepFilterConfig PcWienerNonsepFilterConfigChroma_ = {
    kMaxPrecisionBeforeOverflow,                       // prec_bits
    2 * kNumSymmetricTapsChroma + 1,                   // num_pixels
    0,                                                 // num_pixels2
    wienerns_wout_subtract_center_config_uv_from_uv_,  // config
    nullptr,                                           // config2
    0,                                                 // strict_bounds
    0                                                  // subtract_center
  };

  // Configuration for UnitSumFilterConfig_ wiener nonseparable 7x7 filters for
  // DIAMOND shape. Format is offset (i) row and (ii) column from center pixel
  // and the (iii) filter-tap index that multiplies the pixel at the respective
  // offset.
  const int WienerNonsepConfig_[25][3] = {
    { 1, 0, 0 },
    { -1, 0, 0 },
    { 0, 1, 1 },
    { 0, -1, 1 },
    { 2, 0, 2 },
    { -2, 0, 2 },
    { 0, 2, 3 },
    { 0, -2, 3 },
    { 1, 1, 4 },
    { -1, -1, 4 },
    { -1, 1, 5 },
    { 1, -1, 5 },
    { 2, 1, 6 },
    { -2, -1, 6 },
    { 2, -1, 7 },
    { -2, 1, 7 },
    { 1, 2, 8 },
    { -1, -2, 8 },
    { 1, -2, 9 },
    { -1, 2, 9 },
    { 3, 0, 10 },
    { -3, 0, 10 },
    { 0, 3, 11 },
    { 0, -3, 11 },
#if USE_CENTER_WIENER_NONSEP
    { 0, 0, 12 },
#endif  // USE_CENTER_WIENER_NONSEP
  };

  // Chroma layout without a center entry: six mirrored offset pairs.
  const int WienerNonsepConfigChroma_[12][3] = {
    { 1, 0, 0 }, { -1, 0, 0 },  { 0, 1, 1 },  { 0, -1, 1 },
    { 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
    { 2, 0, 4 }, { -2, 0, 4 },  { 0, 2, 5 },  { 0, -2, 5 },
  };

  // Filters use only the first kNumSymmetricTaps taps (covering
  // 2 * kNumSymmetricTaps pixels) unless USE_CENTER_WIENER_NONSEP adds the
  // center entry. Center tap is constrained.
  const NonsepFilterConfig UnitSumFilterConfig_ = {
    kMaxPrecisionBeforeOverflow,  // prec_bits
#if USE_CENTER_WIENER_NONSEP
    2 * kNumSymmetricTaps + 1,  // num_pixels
#else
    2 * kNumSymmetricTaps,  // num_pixels
#endif  // USE_CENTER_WIENER_NONSEP
    0,                    // num_pixels2
    WienerNonsepConfig_,  // config
    nullptr,              // config2
    0,                    // strict_bounds
    1                     // subtract_center
  };

  // Config used for filtering of chroma when CONFIG_WIENER_NONSEP=1.
  const NonsepFilterConfig UnitSumFilterConfigChroma_ = {
    kMaxPrecisionBeforeOverflow,  // prec_bits
    2 * kNumSymmetricTapsChroma,  // num_pixels
    0,                            // num_pixels2
    WienerNonsepConfigChroma_,    // config
    nullptr,                      // config2
    0,                            // strict_bounds
    1                             // subtract_center
  };

  // Random taps shared by the bit-match and speed paths; refilled by
  // SetFilterTaps().
  int16_t FilterTaps_[kNumSymmetricTaps + 1];
};
// Registers the bit-match test and the speed test for the nonseparable
// fixture with the RESTORE_PC_WIENER restoration type. The speed test carries
// googletest's DISABLED_ prefix, so it only runs when disabled tests are
// explicitly requested.
TEST_P(AV1ConvolveNonSep2DHighbdTest, RunTest) { RunTest(RESTORE_PC_WIENER); }
TEST_P(AV1ConvolveNonSep2DHighbdTest, DISABLED_Speed) {
RunSpeedTest(RESTORE_PC_WIENER);
}
// Only an AVX2 instantiation is registered for this fixture, compiled in when
// HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveNonSep2DHighbdTest,
BuildHighbdParams(av1_convolve_symmetric_highbd_avx2));
#endif
// Empty subclass of the nonseparable fixture. It adds no behaviour; it exists
// so the RESTORE_WIENER_NONSEP tests can be registered and instantiated under
// their own suite name with a different function under test.
class AV1ConvolveWienerNonSep2DHighbdTest
: public AV1ConvolveNonSep2DHighbdTest {};
// Bit-match test with the RESTORE_WIENER_NONSEP restoration type.
TEST_P(AV1ConvolveWienerNonSep2DHighbdTest, RunTest) {
RunTest(RESTORE_WIENER_NONSEP);
}
// Speed test; skipped by default because of the DISABLED_ prefix.
TEST_P(AV1ConvolveWienerNonSep2DHighbdTest, DISABLED_Speed) {
RunSpeedTest(RESTORE_WIENER_NONSEP);
}
// Only an AVX2 instantiation is registered, compiled in when HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1ConvolveWienerNonSep2DHighbdTest,
BuildHighbdParams(av1_convolve_symmetric_subtract_center_highbd_avx2));
#endif
#endif // CONFIG_LR_IMPROVEMENTS
//////////////////////////////////////////////////////////
// Nonseparable convolve-2d Dual functions (high bit-depth)
//////////////////////////////////////////////////////////
#if CONFIG_LR_IMPROVEMENTS
// Pointer type for the high bit-depth nonseparable "dual" 2D convolution
// kernels exercised by the fixture below (for example
// av1_convolve_symmetric_dual_highbd_c). Unlike the single-input variant, the
// kernel takes two input buffers, `dgd` and `dgd_dual`, each with its own
// stride.
using highbd_convolve_nonsep_dual_2d_func = void (*)(
    const uint16_t *dgd, int dgd_stride, const uint16_t *dgd_dual,
    int dgd_dual_stride, const NonsepFilterConfig *filter_config,
    const int16_t *filter, uint16_t *dst, int dst_stride, int bit_depth,
    int block_row_begin, int block_row_end, int block_col_begin,
    int block_col_end);
// Bit-exactness and speed fixture for the high bit-depth nonseparable dual
// 2D convolution kernels, which filter one output from two input buffers.
// The C reference and filter configuration depend on is_subtract_center:
//   non-zero -> av1_convolve_symmetric_dual_subtract_center_highbd_c with
//               DualFilterWithCenterConfig_
//   zero     -> av1_convolve_symmetric_dual_highbd_c with
//               DualFilterWithoutCenterConfig_
class AV1ConvolveNon_Sep_dual2DHighbdTest
    : public AV1ConvolveTest<highbd_convolve_nonsep_dual_2d_func> {
 public:
  // Runs kTestIterations bit-match rounds, drawing fresh random filter taps
  // before each round. Stops at the first fatal failure so a single mismatch
  // is not reported once per remaining iteration.
  void RunTest(int is_subtract_center) {
    for (int i = 0; i < kTestIterations; i++) {
      SetFilterTaps();
      TestConvolve(FilterTaps_, is_subtract_center);
      if (HasFatalFailure()) return;
    }
  }

  // Times the C reference against the function under test and prints the
  // per-pixel cost of each.
  void RunSpeedTest(int is_subtract_center) {
    // FilterTaps_ has no initializer, so it must be populated before the
    // kernels read it.
    SetFilterTaps();
    SpeedTestConvolve(FilterTaps_, is_subtract_center);
  }

 private:
  libaom_test::ACMRandom rnd_;
  // Used both as the bit range of the random taps and as the prec_bits field
  // of the filter configurations below.
  static constexpr int kMaxPrecisionBeforeOverflow = 12;
  static constexpr int kNumSymmetricTaps = 6;
  // In dual filtering, 7 taps (6 symmetric + 1 center) are required for each
  // of the two buffers.
  static constexpr int kNumSubtractCenterOffTaps = (2 * kNumSymmetricTaps) + 2;
  static constexpr int kMaxTapOffset = 2;  // Filters are 5x5.
  static constexpr int kSpeedIterations = 10000;
  static constexpr int kTestIterations = 100;

  // Declare the filter taps for worst case (i.e., for subtract center off
  // case).
  int16_t FilterTaps_[kNumSubtractCenterOffTaps];

  // Fills the array p with random values. Masking with
  // (1 << max_bit_range) - 1 keeps only the low max_bit_range bits, so every
  // stored value lies in [0, 2^max_bit_range - 1].
  void Randomize(int16_t *p, int size, int max_bit_range) {
    ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
    for (int i = 0; i < size; ++i) {
      p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1);
    }
  }

  // Refills every entry of FilterTaps_ with random values.
  void SetFilterTaps() {
    Randomize(FilterTaps_, kNumSubtractCenterOffTaps,
              kMaxPrecisionBeforeOverflow);
  }

  // Returns 0 or 1, taken from the top bit of an 8-bit random value.
  int RandBool() {
    const uint32_t value = rnd_.Rand8();
    // There's a bit more entropy in the upper bits of this implementation.
    return (value >> 7) & 0x1;
  }

  // Fills the array p with a random mix of the largest and smallest values of
  // a signed max_bit_range-bit integer, i.e. 2^(max_bit_range - 1) - 1 and
  // -2^(max_bit_range - 1).
  void RandomizeExtremeFilterTap(int16_t *p, int size, int max_bit_range) {
    ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
    const int sign_max_val = (1 << (max_bit_range - 1)) - 1;
    for (int i = 0; i < size; ++i) {
      p[i] =
          static_cast<int16_t>(RandBool() ? sign_max_val : -(sign_max_val + 1));
    }
  }

  // Runs the reference and the function under test on the same pair of
  // inputs and asserts that the outputs are identical over a width x height
  // region. Both inputs are read with the same stride, dgd_stride.
  void BitMatchTest(const uint16_t *dgd, const uint16_t *dgd_dual,
                    int dgd_stride, int width, int height,
                    const int16_t *filter, uint16_t *reference, uint16_t *test,
                    int dst_stride, int bit_depth, int block_row_begin,
                    int block_row_end, int block_col_begin, int block_col_end,
                    int is_subtract_center) {
    // Set filter_config and reference function appropriately.
    highbd_convolve_nonsep_dual_2d_func ref_func;
    const NonsepFilterConfig *filter_cfg;
    filter_cfg = &DualFilterWithCenterConfig_;
    ref_func = av1_convolve_symmetric_dual_subtract_center_highbd_c;
    if (!is_subtract_center) {
      ref_func = av1_convolve_symmetric_dual_highbd_c;
      filter_cfg = &DualFilterWithoutCenterConfig_;
    }

    // Reference function
    ref_func(dgd, dgd_stride, dgd_dual, dgd_stride, filter_cfg, filter,
             reference, dst_stride, bit_depth, block_row_begin, block_row_end,
             block_col_begin, block_col_end);

    // Test function
    GetParam().TestFunction()(dgd, dgd_stride, dgd_dual, dgd_stride, filter_cfg,
                              filter, test, dst_stride, bit_depth,
                              block_row_begin, block_row_end, block_col_begin,
                              block_col_end);

    // Compare the output of reference and test for bit match
    AssertOutputBufferEq(reference, test, width, height);
  }

  // Bit-match test on two data sets: 12-bit random inputs with the supplied
  // `filter`, followed by extreme-valued inputs with extreme-valued taps.
  void TestConvolve(const int16_t *filter, int is_subtract_center) {
    const int width = GetParam().Block().Width();
    const int height = GetParam().Block().Height();
    const int bit_depth = GetParam().BitDepth();

    const uint16_t *dgd = FirstRandomInput12(GetParam());
    // NOTE(review): the second plane is taken from SecondRandomInput12 so the
    // two inputs carry different data. This assumes FirstRandomInput12 hands
    // back the same backing buffer on every call (in which case calling it
    // twice would alias both planes) -- TODO confirm against AV1ConvolveTest.
    const uint16_t *dgd_dual = SecondRandomInput12(GetParam());
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);

    ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
        << "Not enough padding for 5x5 filters";

    // Skip kMaxTapOffset rows and columns so that taps reaching up/left of
    // the first processed pixel stay inside the input buffers.
    const uint16_t *centered_input1 =
        dgd + kMaxTapOffset * width + kMaxTapOffset;
    const uint16_t *centered_input2 =
        dgd_dual + kMaxTapOffset * width + kMaxTapOffset;
    const int input_stride = width;

    BitMatchTest(centered_input1, centered_input2, input_stride, width, height,
                 filter, reference, test, kOutputStride, bit_depth, 0, height,
                 0, width, is_subtract_center);

    // Extreme value test
    const uint16_t *extreme_input1 = FirstRandomInput16Extreme(GetParam());
    const uint16_t *extreme_input2 = FirstRandomInput16Extreme(GetParam());
    const uint16_t *centered_extreme_input1 =
        extreme_input1 + kMaxTapOffset * width + kMaxTapOffset;
    const uint16_t *centered_extreme_input2 =
        extreme_input2 + kMaxTapOffset * width + kMaxTapOffset;
    int16_t extreme_taps[kNumSubtractCenterOffTaps];
    RandomizeExtremeFilterTap(extreme_taps, kNumSubtractCenterOffTaps,
                              kMaxPrecisionBeforeOverflow);
    BitMatchTest(centered_extreme_input1, centered_extreme_input2, input_stride,
                 width, height, extreme_taps, reference, test, kOutputStride,
                 bit_depth, 0, height, 0, width, is_subtract_center);
  }

  // Times kSpeedIterations calls of the reference and of the function under
  // test and prints the results.
  void SpeedTestConvolve(const int16_t *filter, int is_subtract_center) {
    const int width = GetParam().Block().Width();
    const int height = GetParam().Block().Height();
    const int bit_depth = GetParam().BitDepth();

    const uint16_t *dgd = FirstRandomInput12(GetParam());
    // NOTE(review): second plane drawn from SecondRandomInput12 so the two
    // inputs do not share a buffer, assuming FirstRandomInput12 returns the
    // same buffer on each call -- TODO confirm.
    const uint16_t *dgd_dual = SecondRandomInput12(GetParam());
    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);

    ASSERT_TRUE(kInputPadding >= kMaxTapOffset)
        << "Not enough padding for 5x5 filters";
    const uint16_t *centered_input1 =
        dgd + kMaxTapOffset * width + kMaxTapOffset;
    const uint16_t *centered_input2 =
        dgd_dual + kMaxTapOffset * width + kMaxTapOffset;

    // Set filter_config and reference function appropriately.
    highbd_convolve_nonsep_dual_2d_func ref_func;
    const NonsepFilterConfig *filter_cfg;
    filter_cfg = &DualFilterWithCenterConfig_;
    ref_func = av1_convolve_symmetric_dual_subtract_center_highbd_c;
    if (!is_subtract_center) {
      ref_func = av1_convolve_symmetric_dual_highbd_c;
      filter_cfg = &DualFilterWithoutCenterConfig_;
    }

    // Calculate time taken by reference/c function
    aom_usec_timer timer;
    aom_usec_timer_start(&timer);
    for (int i = 0; i < kSpeedIterations; ++i) {
      ref_func(centered_input1, width, centered_input2, width, filter_cfg,
               filter, reference, kOutputStride, bit_depth, 0, height, 0,
               width);
    }
    aom_usec_timer_mark(&timer);
    auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

    // Calculate time taken by optimized/intrinsic function
    aom_usec_timer_start(&timer);
    for (int i = 0; i < kSpeedIterations; ++i) {
      GetParam().TestFunction()(centered_input1, width, centered_input2, width,
                                filter_cfg, filter, test, kOutputStride,
                                bit_depth, 0, height, 0, width);
    }
    aom_usec_timer_mark(&timer);
    auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

    // Timer values are scaled by 1000 and divided by the number of pixels
    // processed across all iterations.
    float c_time_per_pixel =
        1000.0f * elapsed_time_c / (kSpeedIterations * width * height);
    float opt_time_per_pixel =
        1000.0f * elapsed_time_opt / (kSpeedIterations * width * height);
    float scaling = c_time_per_pixel / opt_time_per_pixel;
    printf(
        "  %3dx%-3d: c_time_per_pixel=%10.5f, "
        "opt_time_per_pixel=%10.5f, scaling=%f \n",
        width, height, c_time_per_pixel, opt_time_per_pixel, scaling);
  }

  // Pixel layouts for the subtract-center case. Each entry is (row offset,
  // column offset, filter-tap index). The first table uses tap indices 0..5,
  // the second continues with 6..11.
  const int wienerns_config_uv_from_uv[12][3] = {
    { 1, 0, 0 }, { -1, 0, 0 },  { 0, 1, 1 },  { 0, -1, 1 },
    { 1, 1, 2 }, { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 },
    { 2, 0, 4 }, { -2, 0, 4 },  { 0, 2, 5 },  { 0, -2, 5 },
  };

  const int wienerns_config_uv_from_y[12][3] = {
    { 1, 0, 6 },  { -1, 0, 6 },  { 0, 1, 7 },   { 0, -1, 7 },
    { 1, 1, 8 },  { -1, -1, 8 }, { -1, 1, 9 },  { 1, -1, 9 },
    { 2, 0, 10 }, { -2, 0, 10 }, { 0, 2, 11 },  { 0, -2, 11 },
  };

  // Pixel layouts for the case without subtract-center: the same offsets plus
  // an explicit entry at (0, 0), using tap indices 0..6.
  const int wienerns_wout_subtract_center_config_uv_from_uv[13][3] = {
    { 1, 0, 0 },   { -1, 0, 0 }, { 0, 1, 1 },  { 0, -1, 1 }, { 1, 1, 2 },
    { -1, -1, 2 }, { -1, 1, 3 }, { 1, -1, 3 }, { 2, 0, 4 },  { -2, 0, 4 },
    { 0, 2, 5 },   { 0, -2, 5 }, { 0, 0, 6 },
  };

  // Adjust the beginning tap to account for the above change and add a tap at
  // (0, 0).
  const int wienerns_wout_subtract_center_config_uv_from_y[13][3] = {
    { 1, 0, 7 },   { -1, 0, 7 },  { 0, 1, 8 },  { 0, -1, 8 }, { 1, 1, 9 },
    { -1, -1, 9 }, { -1, 1, 10 }, { 1, -1, 10 }, { 2, 0, 11 }, { -2, 0, 11 },
    { 0, 2, 12 },  { 0, -2, 12 }, { 0, 0, 13 },
  };

  const NonsepFilterConfig DualFilterWithCenterConfig_ = {
    kMaxPrecisionBeforeOverflow,  // prec_bits;
    sizeof(wienerns_config_uv_from_uv) /
        sizeof(wienerns_config_uv_from_uv[0]),  // num_pixels;
    sizeof(wienerns_config_uv_from_y) /
        sizeof(wienerns_config_uv_from_y[0]),  // num_pixels2
    wienerns_config_uv_from_uv,                // config
    wienerns_config_uv_from_y,                 // config2
    0,                                         // strict_bounds
    1                                          // subtract_center
  };

  const NonsepFilterConfig DualFilterWithoutCenterConfig_ = {
    kMaxPrecisionBeforeOverflow,  // prec_bits;
    sizeof(wienerns_wout_subtract_center_config_uv_from_uv) /
        sizeof(
            wienerns_wout_subtract_center_config_uv_from_uv[0]),  // num_pixels;
    sizeof(wienerns_wout_subtract_center_config_uv_from_y) /
        sizeof(
            wienerns_wout_subtract_center_config_uv_from_y[0]),  // num_pixels2
    wienerns_wout_subtract_center_config_uv_from_uv,             // config
    wienerns_wout_subtract_center_config_uv_from_y,              // config2
    0,                                                           // strict_bounds
    0  // subtract_center
  };
};
// Registers the bit-match test and the speed test for the dual-input fixture
// with subtract-center enabled (argument 1). The speed test carries
// googletest's DISABLED_ prefix, so it only runs when disabled tests are
// explicitly requested.
TEST_P(AV1ConvolveNon_Sep_dual2DHighbdTest, RunTest) { RunTest(1); }
TEST_P(AV1ConvolveNon_Sep_dual2DHighbdTest, DISABLED_Speed) { RunSpeedTest(1); }
// Only an AVX2 instantiation is registered, compiled in when HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1ConvolveNon_Sep_dual2DHighbdTest,
BuildHighbdParams(av1_convolve_symmetric_dual_subtract_center_highbd_avx2));
#endif // HAVE_AVX2
// Unit test for dual filtering with subtract-center turned off. The fixture
// is an empty subclass of the dual-input fixture; it adds no behaviour and
// exists so these tests can be registered and instantiated under their own
// suite name with a different function under test.
class AV1ConvolveDualWithoutsubtract2DHighbdTest
: public AV1ConvolveNon_Sep_dual2DHighbdTest {};
// Bit-match test with subtract-center disabled (argument 0).
TEST_P(AV1ConvolveDualWithoutsubtract2DHighbdTest, RunTest) { RunTest(0); }
// Speed test; skipped by default because of the DISABLED_ prefix.
TEST_P(AV1ConvolveDualWithoutsubtract2DHighbdTest, DISABLED_Speed) {
RunSpeedTest(0);
}
// Only an AVX2 instantiation is registered, compiled in when HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1ConvolveDualWithoutsubtract2DHighbdTest,
BuildHighbdParams(av1_convolve_symmetric_dual_highbd_avx2));
#endif
#endif // CONFIG_LR_IMPROVEMENTS
//////////////////////////////////////////////////////////
// Unit tests for the buffer accumulations used to derive the filter
// index for each block size (pc_wiener_block_size: 4x4)
//////////////////////////////////////////////////////////
#if CONFIG_LR_IMPROVEMENTS
// Generate the list of all block widths / heights that need to be tested for
// pc_wiener.
// Builds the parameter list for the pc_wiener tests: every distinct block
// width/height pair that fits inside a restoration processing unit, crossed
// with each requested bit depth and paired with `test_func`.
template <typename T>
std::vector<TestParam<T>> GetPCWienerTestParams(
    std::initializer_list<int> bit_depths, T test_func) {
  // A set removes duplicate dimensions and fixes the iteration order.
  std::set<BlockSize> unique_sizes;
  for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
    const int wide = block_size_wide[bsize];
    const int high = block_size_high[bsize];
    const bool fits_in_proc_unit = wide <= RESTORATION_PROC_UNIT_SIZE &&
                                   high <= RESTORATION_PROC_UNIT_SIZE;
    if (fits_in_proc_unit) {
      unique_sizes.insert(BlockSize(wide, high));
      // Blocks with a 4-pixel side also contribute their half-size
      // (smaller chroma) variant.
      if (wide == 4 || high == 4) {
        unique_sizes.insert(BlockSize(wide / 2, high / 2));
      }
    }
  }

  // Block size is the outer loop and bit depth the inner one.
  std::vector<TestParam<T>> params;
  for (auto it = unique_sizes.begin(); it != unique_sizes.end(); ++it) {
    for (const int depth : bit_depths) {
      params.push_back(TestParam<T>(*it, depth, test_func));
    }
  }
  return params;
}
// Wraps the pc_wiener parameter list for bit depths 10 and 12 in a googletest
// parameter generator suitable for INSTANTIATE_TEST_SUITE_P.
template <typename T>
::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdPCWienerParams(
    T test_func) {
  const std::vector<TestParam<T>> highbd_params =
      GetPCWienerTestParams({ 10, 12 }, test_func);
  return ::testing::ValuesIn(highbd_params);
}
// Pointer type for the functions that fill the directional feature buffers
// (for example fill_directional_feature_buffers_highbd_c). Each call handles
// one `row` of the `dgd` input and updates the sum and line buffer arrays.
using fill_directional_feature_buffers_highbd_func = void (*)(
    int *feature_sum_buffers[], int16_t *feature_line_buffers[], int row,
    int buffer_row, const uint16_t *dgd, int dgd_stride, int width,
    int feature_lead, int feature_lag);
// Bit-exactness and speed fixture for the functions that fill the
// directional feature buffers. The C reference
// (fill_directional_feature_buffers_highbd_c) and the function under test are
// each given their own identically initialised set of line and sum buffers;
// after processing every row the sum buffers must match element for element.
class AV1FillDirFeatureBufHighbdTest
    : public AV1ConvolveTest<fill_directional_feature_buffers_highbd_func> {
 public:
  // Runs kTestIterations rounds, re-randomising the buffers before each one.
  // Stops at the first fatal failure so a single mismatch is not reported
  // once per remaining iteration.
  void RunTest() {
    for (int i = 0; i < kTestIterations; i++) {
      // Set buffer values here.
      SetBufferValues();
      TestConvolve();
      if (HasFatalFailure()) return;
    }
  }

  // Times the C reference against the function under test.
  void RunSpeedTest() {
    // The buffers come straight from aom_malloc(); give them defined contents
    // before the kernels read and accumulate into them.
    SetBufferValues();
    SpeedTestConvolve();
  }

 protected:
  // Allocates buffer_width_ elements for every line and sum buffer, for both
  // the reference and the function under test.
  // NOTE(review): this replaces any SetUp() defined by the AV1ConvolveTest
  // base without calling it -- confirm the base set-up is not required.
  void SetUp() override {
    for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
      feature_line_buffers_c_[j] = static_cast<int16_t *>(
          (aom_malloc(buffer_width_ * sizeof(*feature_line_buffers_c_[j]))));
      ASSERT_NE(feature_line_buffers_c_[j], nullptr);
      feature_line_buffers_simd_[j] = static_cast<int16_t *>(
          (aom_malloc(buffer_width_ * sizeof(*feature_line_buffers_simd_[j]))));
      ASSERT_NE(feature_line_buffers_simd_[j], nullptr);
    }

    for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
      feature_sum_buffers_c_[j] = static_cast<int *>(
          (aom_malloc(buffer_width_ * sizeof(*feature_sum_buffers_c_[j]))));
      ASSERT_NE(feature_sum_buffers_c_[j], nullptr);
      feature_sum_buffers_simd_[j] = static_cast<int *>(
          (aom_malloc(buffer_width_ * sizeof(*feature_sum_buffers_simd_[j]))));
      ASSERT_NE(feature_sum_buffers_simd_[j], nullptr);
    }
  }

  // Releases every buffer and resets the pointers. Slots SetUp() never
  // reached are null rather than indeterminate because the pointer tables are
  // value-initialised.
  // NOTE(review): presumes aom_free() accepts a null pointer like free() --
  // TODO confirm.
  void TearDown() override {
    for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
      aom_free(feature_line_buffers_c_[j]);
      feature_line_buffers_c_[j] = nullptr;
      aom_free(feature_line_buffers_simd_[j]);
      feature_line_buffers_simd_[j] = nullptr;
    }
    for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
      aom_free(feature_sum_buffers_c_[j]);
      feature_sum_buffers_c_[j] = nullptr;
      aom_free(feature_sum_buffers_simd_[j]);
      feature_sum_buffers_simd_[j] = nullptr;
    }
  }

  // Randomises the reference buffers and copies them into the buffers of the
  // function under test, so both start from identical state.
  void SetBufferValues() {
    const int bitdepth = GetParam().BitDepth();
    for (int j = 0; j < NUM_FEATURE_LINE_BUFFERS; ++j) {
      Randomize(feature_line_buffers_c_[j], buffer_width_, bitdepth);
      memcpy(feature_line_buffers_simd_[j], feature_line_buffers_c_[j],
             buffer_width_ * sizeof(*feature_line_buffers_simd_[j]));
    }
    for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
      RandomizeSigned31(feature_sum_buffers_c_[j], buffer_width_, 31);
      memcpy(feature_sum_buffers_simd_[j], feature_sum_buffers_c_[j],
             buffer_width_ * sizeof(*feature_sum_buffers_simd_[j]));
    }
  }

 private:
  libaom_test::ACMRandom rnd_;
  static constexpr int kSpeedIterations = 10000;
  static constexpr int kTestIterations = 100;

  // Feeds every row of the block through the reference and then through the
  // function under test, and compares the resulting sum buffers.
  void TestConvolve() {
    const int width = GetParam().Block().Width();
    const int height = GetParam().Block().Height();

    // Input buffer allocation.
    const uint16_t *input = FirstRandomInput12(GetParam());
    const int input_stride = width;

    // C function call
    for (int i = 0; i < height; ++i) {
      // The row index is offset by feature_lag and capped at height + 1.
      const int row_to_process = AOMMIN(i + feature_lag, height + 3 - 2);
      fill_directional_feature_buffers_highbd_c(
          feature_sum_buffers_c_, feature_line_buffers_c_, row_to_process,
          feature_length - 1, input, input_stride, width, feature_lead,
          feature_lag);
    }

    // SIMD function call
    for (int i = 0; i < height; ++i) {
      const int row_to_process = AOMMIN(i + feature_lag, height + 3 - 2);
      GetParam().TestFunction()(feature_sum_buffers_simd_,
                                feature_line_buffers_simd_, row_to_process,
                                feature_length - 1, input, input_stride, width,
                                feature_lead, feature_lag);
    }

    // Compare the outputs of C and SIMD
    for (int i = 0; i < NUM_PC_WIENER_FEATURES; i++) {
      int *c_buf = feature_sum_buffers_c_[i];
      int *simd_buf = feature_sum_buffers_simd_[i];
      for (int j = 0; j < buffer_width_; ++j) {
        // Report the column index j, not the feature index, as the position.
        ASSERT_EQ(c_buf[j], simd_buf[j])
            << "feature_buf=" << i << " Pixel mismatch at width (" << j << ")";
      }
    }
  }

  // Times kSpeedIterations full passes over the block for the reference and
  // for the function under test, and prints the per-pixel cost of each.
  void SpeedTestConvolve() {
    const int width = GetParam().Block().Width();
    const int height = GetParam().Block().Height();

    // Input buffer allocation.
    const uint16_t *input = FirstRandomInput12(GetParam());
    const int input_stride = width;

    // Calculate time taken for C function
    aom_usec_timer timer;
    aom_usec_timer_start(&timer);
    for (int iter = 0; iter < kSpeedIterations; ++iter) {
      for (int row = 0; row < height; ++row) {
        const int row_to_process = AOMMIN(row + feature_lag, height + 3 - 2);
        fill_directional_feature_buffers_highbd_c(
            feature_sum_buffers_c_, feature_line_buffers_c_, row_to_process,
            feature_length - 1, input, input_stride, width, feature_lead,
            feature_lag);
      }
    }
    aom_usec_timer_mark(&timer);
    auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

    // Calculate time taken by optimized/intrinsic function
    aom_usec_timer_start(&timer);
    for (int iter = 0; iter < kSpeedIterations; ++iter) {
      for (int row = 0; row < height; ++row) {
        const int row_to_process = AOMMIN(row + feature_lag, height + 3 - 2);
        GetParam().TestFunction()(feature_sum_buffers_simd_,
                                  feature_line_buffers_simd_, row_to_process,
                                  feature_length - 1, input, input_stride,
                                  width, feature_lead, feature_lag);
      }
    }
    aom_usec_timer_mark(&timer);
    auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

    // Timer values are scaled by 1000 and divided by the number of pixels
    // processed across all iterations.
    float c_time_per_pixel =
        1000.0f * elapsed_time_c / (kSpeedIterations * width * height);
    float opt_time_per_pixel =
        1000.0f * elapsed_time_opt / (kSpeedIterations * width * height);
    float scaling = c_time_per_pixel / opt_time_per_pixel;
    printf(
        "%3dx%-3d: c_time_per_pixel=%10.5f, "
        "opt_time_per_pixel=%10.5f, scaling=%f \n",
        width, height, c_time_per_pixel, opt_time_per_pixel, scaling);
  }

  // Fills the array p with random values. Masking with
  // (1 << max_bit_range) - 1 keeps only the low max_bit_range bits, so every
  // stored value lies in [0, 2^max_bit_range - 1].
  void Randomize(int16_t *p, int size, int max_bit_range) {
    ASSERT_TRUE(max_bit_range < 16) << "max_bit_range has to be less than 16";
    for (int i = 0; i < size; ++i) {
      p[i] = rnd_.Rand15Signed() & ((1 << max_bit_range) - 1);
    }
  }

  // Fills the array p with random values limited to the low max_bit_range
  // bits (at most 31), so every stored value is non-negative.
  void RandomizeSigned31(int *p, int size, uint32_t max_bit_range) {
    assert(max_bit_range <= 31);
    // Shift an unsigned one: with max_bit_range == 31 a signed `1 << 31`
    // would shift into the sign bit.
    const uint32_t mask = (uint32_t{ 1 } << max_bit_range) - 1;
    for (int i = 0; i < size; ++i) {
      p[i] = static_cast<int>(rnd_.Rand31() & mask);
    }
  }

  // Buffer tables, value-initialised so that every slot is null until SetUp()
  // fills it.
  int *feature_sum_buffers_c_[NUM_PC_WIENER_FEATURES] = {};
  int *feature_sum_buffers_simd_[NUM_PC_WIENER_FEATURES] = {};
  int16_t *feature_line_buffers_c_[NUM_FEATURE_LINE_BUFFERS] = {};
  int16_t *feature_line_buffers_simd_[NUM_FEATURE_LINE_BUFFERS] = {};
  // Luma feature window parameters passed through to the kernels.
  const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA;
  const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA;
  const int feature_length = PC_WIENER_FEATURE_LENGTH_LUMA;
  // Number of elements allocated for each line and sum buffer.
  const int buffer_width_ = MAX_SB_SIZE + kInputPadding;
};
// Registers the bit-match test and the speed test for the directional feature
// buffer fixture. The speed test carries googletest's DISABLED_ prefix, so it
// only runs when disabled tests are explicitly requested.
TEST_P(AV1FillDirFeatureBufHighbdTest, RunTest) { RunTest(); }
TEST_P(AV1FillDirFeatureBufHighbdTest, DISABLED_Speed) { RunSpeedTest(); }
// Only an AVX2 instantiation is registered, compiled in when HAVE_AVX2 is
// set. It uses the pc_wiener parameter list rather than the generic
// high bit-depth one.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1FillDirFeatureBufHighbdTest,
BuildHighbdPCWienerParams(fill_directional_feature_buffers_highbd_avx2));
#endif // HAVE_AVX2
// Pointer type for the functions that fill the tskip sum buffer (for example
// av1_fill_tskip_sum_buffer_c). Each call handles one `row` of the `tskip`
// input and updates `tskip_sum_buffer`.
using FillTSkipSumBufferFunc = void (*)(int row, const uint8_t *tskip,
                                        int tskip_stride,
                                        int8_t *tskip_sum_buffer, int width,
                                        int height, int tskip_lead,
                                        int tskip_lag, bool use_strict_bounds);

// Test parameter: a one-element tuple holding the function under test.
using AV1FillTSkipSumBufferFuncParam =
    std::tuple<const FillTSkipSumBufferFunc>;
// Fixture that runs a tskip sum-buffer function (the test parameter) and
// av1_fill_tskip_sum_buffer_c on the same random 0/1 tskip map and requires
// their output buffers to match exactly.
class AV1Fill_TSkip_Sum_BufferTest
    : public ::testing::TestWithParam<AV1FillTSkipSumBufferFuncParam> {
 public:
  virtual void SetUp() { target_func_ = GET_PARAM(0); }

  // Repeats the bit-exactness check kTestIterations times; each run draws a
  // fresh random input.
  void RunTest() {
    int runs_left = kTestIterations;
    while (runs_left-- > 0) {
      TestTSkipSum();
    }
  }

  // Times the C reference and the function under test and prints the result.
  void RunSpeedTest() { SpeedTestTSkipSum(); }

 private:
  libaom_test::ACMRandom rnd_;
  FillTSkipSumBufferFunc target_func_;
  static constexpr int kSpeedIterations = 10000;
  static constexpr int kTestIterations = 100;
  static constexpr int kNumPlanes = 1;
  static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE;
  static constexpr int kHeight = RESTORATION_PROC_UNIT_SIZE;
  static constexpr int kInputWidth = MI_SIZE_64X64;
  static constexpr int kInputStride = MI_SIZE_64X64;
  static constexpr int kOutputWidth =
      (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1);
  uint8_t input_buffer_[MI_SIZE_64X64 * MI_SIZE_64X64];
  int8_t ref_buffer_[kOutputWidth];
  int8_t test_buffer_[kOutputWidth];
  const bool tskip_strict_ = true;

  // Returns 0 or 1, taken from the top bit of a random byte.
  int RandBool() {
    // There's a bit more entropy in the upper bits of this implementation.
    return (rnd_.Rand8() >> 7) & 0x1;
  }

  // Overwrites the whole tskip input map with random 0/1 flags.
  void RandomizeInput() {
    for (uint8_t &flag : input_buffer_) {
      flag = static_cast<uint8_t>(RandBool() ? 1 : 0);
    }
  }

  // Zeroes both output buffers.
  void ClearOutputs() {
    memset(ref_buffer_, 0, sizeof(ref_buffer_));
    memset(test_buffer_, 0, sizeof(test_buffer_));
  }

  // Feeds rows [-tskip_lead, plane_height + tskip_lag) to the reference and
  // then to the function under test, and asserts the buffers are identical.
  void TestTSkipSum() {
    RandomizeInput();
    for (int plane = 0; plane < kNumPlanes; ++plane) {
      // Planes after the first are halved in both directions.
      const int subsampling = (plane > 0) ? 1 : 0;
      const int plane_width = kWidth >> subsampling;
      const int plane_height = kHeight >> subsampling;
      const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
      const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;
      const int first_row = -tskip_lead;
      const int end_row = tskip_lag + plane_height;
      ClearOutputs();

      // Reference function
      for (int row = first_row; row < end_row; ++row) {
        av1_fill_tskip_sum_buffer_c(row, input_buffer_, kInputStride,
                                    ref_buffer_, plane_width, plane_height,
                                    tskip_lead, tskip_lag, tskip_strict_);
      }

      // Test function
      for (int row = first_row; row < end_row; ++row) {
        target_func_(row, input_buffer_, kInputStride, test_buffer_,
                     plane_width, plane_height, tskip_lead, tskip_lag,
                     tskip_strict_);
      }

      // Compare the output of reference and test for bit match
      for (int col = 0; col < kOutputWidth; ++col) {
        ASSERT_EQ(ref_buffer_[col], test_buffer_[col])
            << " Mismatch at (" << col << ")";
      }
    }
  }

  // Runs each implementation kSpeedIterations times over rows
  // [-tskip_lead, plane_height + tskip_lag - 1) and prints both timings and
  // their ratio.
  void SpeedTestTSkipSum() {
    RandomizeInput();
    for (int plane = 0; plane < kNumPlanes; ++plane) {
      const int subsampling = (plane > 0) ? 1 : 0;
      const int plane_width = kWidth >> subsampling;
      const int plane_height = kHeight >> subsampling;
      const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
      const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;
      const int first_row = -tskip_lead;
      const int end_row = tskip_lag + plane_height - 1;
      ClearOutputs();

      // Calculate time taken by reference/c function
      aom_usec_timer timer;
      aom_usec_timer_start(&timer);
      for (int iter = 0; iter < kSpeedIterations; ++iter) {
        for (int row = first_row; row < end_row; ++row) {
          av1_fill_tskip_sum_buffer_c(row, input_buffer_, kInputStride,
                                      ref_buffer_, plane_width, plane_height,
                                      tskip_lead, tskip_lag, tskip_strict_);
        }
      }
      aom_usec_timer_mark(&timer);
      const auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

      // Calculate time taken by optimized/intrinsic function
      aom_usec_timer_start(&timer);
      for (int iter = 0; iter < kSpeedIterations; ++iter) {
        for (int row = first_row; row < end_row; ++row) {
          target_func_(row, input_buffer_, kInputStride, test_buffer_,
                       plane_width, plane_height, tskip_lead, tskip_lag,
                       tskip_strict_);
        }
      }
      aom_usec_timer_mark(&timer);
      const auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

      const float c_time_per_pixel =
          1000.0f * elapsed_time_c / kSpeedIterations;
      const float opt_time_per_pixel =
          1000.0f * elapsed_time_opt / kSpeedIterations;
      const float scaling = c_time_per_pixel / opt_time_per_pixel;
      printf(
          "  %3dx%-3d: c_time_per_pixel=%10.5f, "
          "opt_time_per_pixel=%10.5f,  scaling=%f \n",
          plane_width, plane_height, c_time_per_pixel, opt_time_per_pixel,
          scaling);
    }
  }
};
// Bit-exactness check: the fixture's RunTest() compares the parameterized
// function against av1_fill_tskip_sum_buffer_c.
TEST_P(AV1Fill_TSkip_Sum_BufferTest, RunTest) { RunTest(); }
// Timing comparison. The DISABLED_ prefix keeps gtest from running it unless
// --gtest_also_run_disabled_tests is passed.
TEST_P(AV1Fill_TSkip_Sum_BufferTest, DISABLED_Speed) { RunSpeedTest(); }
// Only the AVX2 implementation is registered, and only when HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(AVX2, AV1Fill_TSkip_Sum_BufferTest,
::testing::Values(av1_fill_tskip_sum_buffer_avx2));
#endif // HAVE_AVX2
//////////////////////////////////////////////////////////
// unit-test for 'directional_feature_accum' //
//////////////////////////////////////////////////////////
typedef void (*FillDirFeatureAccumFunc)(
int dir_feature_accum[NUM_PC_WIENER_FEATURES][PC_WIENER_FEATURE_ACC_SIZE],
int *feature_sum_buf[NUM_PC_WIENER_FEATURES], int width, int col_offset,
int feature_lead, int feature_lag);
typedef std::tuple<const FillDirFeatureAccumFunc>
AV1FillDirFeatureAccumFuncParam;
// Fixture that runs a directional feature-accumulator function (the test
// parameter) and av1_fill_directional_feature_accumulators_c on the same
// random feature sum buffers and requires their accumulators to match exactly.
class AV1FeatureDirAccumHighbdTest
    : public ::testing::TestWithParam<AV1FillDirFeatureAccumFuncParam> {
 public:
  // Repeats the bit-exactness check kTestIterations times, refilling the
  // inputs with new random data before each run.
  void RunTest() {
    for (int i = 0; i < kTestIterations; i++) {
      FillInputBufs();
      TestFillDirFeatureAccum();
    }
  }

  // Times the C reference and the function under test and prints the result.
  void RunSpeedTest() { SpeedTestConvolve(); }

  // Fetches the function under test and allocates one input row of
  // kInputWidth ints per feature. An allocation failure is fatal, so the test
  // body never runs with a null buffer.
  virtual void SetUp() {
    target_func_ = GET_PARAM(0);
    for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
      feature_sum_buf[j] = static_cast<int *>(
          aom_malloc(kInputWidth * sizeof(*feature_sum_buf[j])));
      ASSERT_TRUE(feature_sum_buf[j] != nullptr)
          << "Failed to allocate feature_sum_buf[" << j << "]";
    }
  }

  // Releases the input rows allocated by SetUp().
  virtual void TearDown() {
    for (int j = 0; j < NUM_PC_WIENER_FEATURES; ++j) {
      aom_free(feature_sum_buf[j]);
      feature_sum_buf[j] = nullptr;
    }
  }

 private:
  libaom_test::ACMRandom rnd_;
  FillDirFeatureAccumFunc target_func_;
  static constexpr int kSpeedIterations = 1000000;
  static constexpr int kTestIterations = 100;
  static constexpr int kNumPlanes = 2;
  static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE;
  static constexpr int kInputWidth =
      (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1);
  // Null-initialized so TearDown() only ever sees null or allocated pointers,
  // even when SetUp() stops early on an allocation failure.
  int *feature_sum_buf[NUM_PC_WIENER_FEATURES] = {};
  int dir_feature_accum_buf_c[NUM_PC_WIENER_FEATURES]
                             [PC_WIENER_FEATURE_ACC_SIZE] = { { 0 } };
  int dir_feature_accum_buf_simd[NUM_PC_WIENER_FEATURES]
                                [PC_WIENER_FEATURE_ACC_SIZE] = { { 0 } };

  // Returns 0 or 1, taken from the top bit of a random byte.
  int RandBool() {
    const uint32_t value = rnd_.Rand8();
    // There's a bit more entropy in the upper bits of this implementation.
    return (value >> 7) & 0x1;
  }

  // Fills every feature row with random signed values of up to 23 bits in
  // magnitude and clears both output accumulators.
  void FillInputBufs() {
    for (int i = 0; i < NUM_PC_WIENER_FEATURES; ++i) {
      for (int j = 0; j < kInputWidth; ++j) {
        // For the extreme values case, the maximum input that feature_sum_buf
        // can take is (kInputWidth * 2 * input_max_value). Hence, clipping the
        // value generated to 23 bit.
        const int max_range = (1 << 23);
        const int value = rnd_.Rand31() % max_range;
        // Store the full signed value: the buffer holds ints, and narrowing
        // it here would reduce every input to 0..255 and lose the
        // extreme-value coverage described above.
        feature_sum_buf[i][j] = RandBool() ? value : -value;
      }
    }
    // Reset output buffers
    av1_zero(dir_feature_accum_buf_c);
    av1_zero(dir_feature_accum_buf_simd);
  }

  // Runs the reference and the function under test on the current inputs for
  // the full width and for the half width, and asserts every accumulator
  // entry matches. Note that feature_lag is what gets passed as the
  // col_offset argument.
  void TestFillDirFeatureAccum() {
    for (int plane = 0; plane < kNumPlanes; ++plane) {
      const int is_uv = (plane > 0);
      const int ss_x = is_uv ? 1 : 0;
      const int plane_width = kWidth >> ss_x;
      const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA;
      const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA;

      // Reset output buffers
      av1_zero(dir_feature_accum_buf_c);
      av1_zero(dir_feature_accum_buf_simd);

      // C function call
      av1_fill_directional_feature_accumulators_c(
          dir_feature_accum_buf_c, feature_sum_buf, plane_width, feature_lag,
          feature_lead, feature_lag);

      // SIMD function call
      target_func_(dir_feature_accum_buf_simd, feature_sum_buf, plane_width,
                   feature_lag, feature_lead, feature_lag);

      // Compare the output of reference and test for bit match
      for (int i = 0; i < NUM_PC_WIENER_FEATURES; i++) {
        for (int j = 0; j < PC_WIENER_FEATURE_ACC_SIZE; j++) {
          ASSERT_EQ(dir_feature_accum_buf_c[i][j],
                    dir_feature_accum_buf_simd[i][j])
              << " Feature_Buf: Pixel mismatch at (" << i << ", " << j << ", "
              << plane_width << ")";
        }
      }
    }
  }

  // For each plane width, calls each implementation kSpeedIterations times on
  // freshly randomized inputs and prints the time per pixel and the ratio.
  void SpeedTestConvolve() {
    for (int plane = 0; plane < kNumPlanes; ++plane) {
      const int is_uv = (plane > 0);
      const int ss_x = is_uv ? 1 : 0;
      const int plane_width = kWidth >> ss_x;
      const int feature_lead = PC_WIENER_FEATURE_LEAD_LUMA;
      const int feature_lag = PC_WIENER_FEATURE_LAG_LUMA;
      FillInputBufs();

      // Calculate time taken by reference/c function
      aom_usec_timer timer;
      aom_usec_timer_start(&timer);
      for (int i = 0; i < kSpeedIterations; ++i) {
        av1_fill_directional_feature_accumulators_c(
            dir_feature_accum_buf_c, feature_sum_buf, plane_width, feature_lag,
            feature_lead, feature_lag);
      }
      aom_usec_timer_mark(&timer);
      auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

      // Calculate time taken by optimized/intrinsic function
      aom_usec_timer_start(&timer);
      for (int i = 0; i < kSpeedIterations; ++i) {
        target_func_(dir_feature_accum_buf_simd, feature_sum_buf, plane_width,
                     feature_lag, feature_lead, feature_lag);
      }
      aom_usec_timer_mark(&timer);
      auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

      float c_time_per_pixel =
          (float)1000.0 * elapsed_time_c / (kSpeedIterations * plane_width);
      float opt_time_per_pixel =
          (float)1000.0 * elapsed_time_opt / (kSpeedIterations * plane_width);
      float scaling = c_time_per_pixel / opt_time_per_pixel;
      printf(
          "  %3d: c_time_per_pixel=%10.5f, "
          "opt_time_per_pixel=%10.5f,  scaling=%f \n",
          plane_width, c_time_per_pixel, opt_time_per_pixel, scaling);
    }
  }
};
// Bit-exactness check: the fixture's RunTest() compares the parameterized
// function against av1_fill_directional_feature_accumulators_c.
TEST_P(AV1FeatureDirAccumHighbdTest, RunTest) { RunTest(); }
// Timing comparison. The DISABLED_ prefix keeps gtest from running it unless
// --gtest_also_run_disabled_tests is passed.
TEST_P(AV1FeatureDirAccumHighbdTest, DISABLED_Speed) { RunSpeedTest(); }
// Only the AVX2 implementation is registered, and only when HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1FeatureDirAccumHighbdTest,
::testing::Values(av1_fill_directional_feature_accumulators_avx2));
#endif // HAVE_AVX2
//////////////////////////////////////////////////////////
// unit-test for 'fill_tskip_feature_accumulator' //
//////////////////////////////////////////////////////////
typedef void (*FillTskip_Accumulator_func)(
int16_t tskip_feature_accum[PC_WIENER_FEATURE_ACC_SIZE],
int8_t *tskip_sum_buff, int width, int col_offset, int tskip_lead,
int tskip_lag);
typedef std::tuple<const FillTskip_Accumulator_func>
AV1FillTSkipAccumBufferFuncParam;
// Fixture that runs a tskip feature-accumulator function (the test parameter)
// and av1_fill_tskip_feature_accumulator_c on the same random tskip sum buffer
// and requires their accumulators to match exactly.
class AV1TskipAccumHighbdTest
    : public ::testing::TestWithParam<AV1FillTSkipAccumBufferFuncParam> {
 public:
  // Fetches the function under test and allocates the input buffer. The
  // fixture owns the buffer (it is released in TearDown()), so a failed
  // ASSERT_* that returns early from a test helper cannot leak it.
  virtual void SetUp() {
    target_func_ = GET_PARAM(0);
    tskip_sum_buf = static_cast<int8_t *>(
        aom_malloc(kInputWidth * sizeof(*tskip_sum_buf)));
    ASSERT_TRUE(tskip_sum_buf != nullptr)
        << "Failed to allocate tskip_sum_buf";
  }

  // Releases the input buffer allocated by SetUp().
  virtual void TearDown() {
    aom_free(tskip_sum_buf);
    tskip_sum_buf = nullptr;
  }

  // Repeats the bit-exactness check kTestIterations times; each run draws a
  // fresh random input.
  void RunTest() {
    for (int i = 0; i < kTestIterations; i++) TestTskipAccum();
  }

  // Times the C reference and the function under test and prints the result.
  void RunSpeedTest() { SpeedTestTskipAccum(); }

 private:
  libaom_test::ACMRandom rnd_;
  FillTskip_Accumulator_func target_func_;
  static constexpr int kSpeedIterations = 1000000;
  static constexpr int kTestIterations = 100;
  static constexpr int kNumPlanes = 2;
  static constexpr int kWidth = RESTORATION_PROC_UNIT_SIZE;
  static constexpr int kInputWidth =
      (RESTORATION_PROC_UNIT_SIZE + PC_WIENER_FEATURE_LENGTH_LUMA - 1);
  // Input buffer of kInputWidth entries; allocated in SetUp().
  int8_t *tskip_sum_buf = nullptr;
  int16_t tskip_feature_accum_c[PC_WIENER_FEATURE_ACC_SIZE] = { 0 };
  int16_t tskip_feature_accum_simd[PC_WIENER_FEATURE_ACC_SIZE] = { 0 };

  // Fills the input buffer with random values of either sign.
  void FillInputBuf() {
    // Input buffer filling. Tskip buffer max value will not cross width of
    // restoration unit size. Hence, the generated values are clipped to the
    // same.
    for (int i = 0; i < kInputWidth; ++i) {
      const int8_t value =
          static_cast<int8_t>(rnd_.Rand8() % RESTORATION_PROC_UNIT_SIZE);
      // The buffer is int8_t, so convert to that type directly.
      tskip_sum_buf[i] = static_cast<int8_t>(RandBool() ? value : -value);
    }
  }

  // Returns 0 or 1, taken from the top bit of a random byte.
  int RandBool() {
    const uint32_t value = rnd_.Rand8();
    // There's a bit more entropy in the upper bits of this implementation.
    return (value >> 7) & 0x1;
  }

  // Runs the reference and the function under test on fresh random input for
  // the full width and for the half width, and asserts every accumulator
  // entry matches. Note that tskip_lag is what gets passed as the col_offset
  // argument.
  void TestTskipAccum() {
    FillInputBuf();

    // Loop over luma and chroma plane
    for (int plane = 0; plane < kNumPlanes; ++plane) {
      const int is_uv = (plane > 0);
      const int ss_x = is_uv ? 1 : 0;
      const int plane_width = kWidth >> ss_x;
      const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
      const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;
      av1_zero(tskip_feature_accum_c);
      av1_zero(tskip_feature_accum_simd);

      // C function call
      av1_fill_tskip_feature_accumulator_c(tskip_feature_accum_c, tskip_sum_buf,
                                           plane_width, tskip_lag, tskip_lead,
                                           tskip_lag);

      // SIMD function call
      target_func_(tskip_feature_accum_simd, tskip_sum_buf, plane_width,
                   tskip_lag, tskip_lead, tskip_lag);

      // Compare the output of reference and test for bit match
      for (int i = 0; i < PC_WIENER_FEATURE_ACC_SIZE; i++) {
        ASSERT_EQ(tskip_feature_accum_c[i], tskip_feature_accum_simd[i])
            << " Feature_Buf: Pixel mismatch at (" << i << "," << plane_width
            << ")";
      }
    }
  }

  // For each plane width, calls each implementation kSpeedIterations times on
  // one random input and prints the time per pixel and the ratio.
  void SpeedTestTskipAccum() {
    FillInputBuf();

    for (int plane = 0; plane < kNumPlanes; ++plane) {
      const int is_uv = (plane > 0);
      const int ss_x = is_uv ? 1 : 0;
      const int plane_width = kWidth >> ss_x;
      const int tskip_lead = PC_WIENER_TSKIP_LEAD_LUMA;
      const int tskip_lag = PC_WIENER_TSKIP_LAG_LUMA;

      // Calculate time taken by reference/c function
      aom_usec_timer timer;
      aom_usec_timer_start(&timer);
      for (int i = 0; i < kSpeedIterations; ++i) {
        av1_fill_tskip_feature_accumulator_c(tskip_feature_accum_c,
                                             tskip_sum_buf, plane_width,
                                             tskip_lag, tskip_lead, tskip_lag);
      }
      aom_usec_timer_mark(&timer);
      auto elapsed_time_c = aom_usec_timer_elapsed(&timer);

      // Calculate time taken by optimized/intrinsic function
      aom_usec_timer_start(&timer);
      for (int i = 0; i < kSpeedIterations; ++i) {
        target_func_(tskip_feature_accum_simd, tskip_sum_buf, plane_width,
                     tskip_lag, tskip_lead, tskip_lag);
      }
      aom_usec_timer_mark(&timer);
      auto elapsed_time_opt = aom_usec_timer_elapsed(&timer);

      float c_time_per_pixel =
          (float)1000.0 * elapsed_time_c / (kSpeedIterations * plane_width);
      float opt_time_per_pixel =
          (float)1000.0 * elapsed_time_opt / (kSpeedIterations * plane_width);
      float scaling = c_time_per_pixel / opt_time_per_pixel;
      printf(
          "  %3d: c_time_per_pixel=%10.5f, "
          "opt_time_per_pixel=%10.5f,  scaling=%f \n",
          plane_width, c_time_per_pixel, opt_time_per_pixel, scaling);
    }
  }
};
// Bit-exactness check: the fixture's RunTest() compares the parameterized
// function against av1_fill_tskip_feature_accumulator_c.
TEST_P(AV1TskipAccumHighbdTest, RunTest) { RunTest(); }
// Timing comparison. The DISABLED_ prefix keeps gtest from running it unless
// --gtest_also_run_disabled_tests is passed.
TEST_P(AV1TskipAccumHighbdTest, DISABLED_Speed) { RunSpeedTest(); }
// Only the AVX2 implementation is registered, and only when HAVE_AVX2 is set.
#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
AVX2, AV1TskipAccumHighbdTest,
::testing::Values(av1_fill_tskip_feature_accumulator_avx2));
#endif // HAVE_AVX2
#endif // CONFIG_LR_IMPROVEMENTS
} // namespace