Add AVX2 for av1_calc_proj_params_high_bd_c
This CL adds an AVX2 variant of the function
av1_calc_proj_params_high_bd_c().
              Encode Time
 cpu-used     Reduction(%)
    3            0.387
    4            0.558
Module-level gains improve by a factor of ~5.9x w.r.t. the C code.
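The AVX2 kernels process 8 pixels per iteration and keep the H and C
sums in 64-bit lanes via even/odd 32x32-bit multiplies; callers fall
back to the C implementation when the width is not a multiple of 8.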
Change-Id: I088a5088f78f50bb77ffca4424769a17832d0bce
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 00ba788..58648a5 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -405,6 +405,9 @@
add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
specialize qw/av1_calc_proj_params avx2/;
+ add_proto qw/void av1_calc_proj_params_high_bd/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+ specialize qw/av1_calc_proj_params_high_bd avx2/;
+
add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 4925a3d..0123520 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -631,11 +631,13 @@
}
}
-static AOM_INLINE void av1_calc_proj_params_high_bd_c(
- const uint8_t *src8, int width, int height, int src_stride,
- const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
- int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2],
- const sgr_params_type *params) {
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2],
+ int64_t C[2],
+ const sgr_params_type *params) {
if ((params->r[0] > 0) && (params->r[1] > 0)) {
calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
dat_stride, flt0, flt0_stride, flt1,
@@ -673,9 +675,15 @@
params);
}
} else {
- av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
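+ // The AVX2 kernel loads 8 pixels per iteration, so it handles only
+ // widths that are multiples of 8; otherwise keep the C version.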
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
dat_stride, flt0, flt0_stride, flt1,
flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ }
}
if (params->r[0] == 0) {
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index ef70a7b..32438d5 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -860,6 +860,229 @@
}
}
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h00, h01, h11, c0, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
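+ // _mm256_mul_epi32 multiplies only the low (even) 32-bit lane of each
+ // 64-bit element; shifting both operands right by 32 brings the odd
+ // lanes into position, so all eight products are accumulated in 64 bits.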
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+ const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f2, 32));
+ h01 = _mm256_add_epi64(h01, h01_even);
+ h01 = _mm256_add_epi64(h01, h01_odd);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
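+ // Horizontal reduction: unpack pairs the C[0]/C[1] (and H[0][0]/H[0][1])
+ // partial sums lane by lane, then the two 128-bit halves are folded.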
+ __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+ const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+ c_low = _mm256_add_epi64(c_low, c_high);
+ const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+ _mm256_castsi256_si128(c_low));
+
+ __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+ const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+ h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+ const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+ _mm256_castsi256_si128(h0x_low));
+
+ // Using the symmetric properties of H, calculations of H[1][0] are not
+ // needed.
+ __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+ const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+ h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+ const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+ _mm256_castsi256_si128(h1x_low));
+
+ xx_storeu_128(C, c_128bit);
+ xx_storeu_128(H[0], h0x_128bit);
+ xx_storeu_128(H[1], h1x_128bit);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h00, c0;
+ const __m256i zero = _mm256_setzero_si256();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+
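+ // Accumulate f1*f1 and f1*s over even and odd lanes, as in the
+ // r0_r1 kernel above.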
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+ }
+ }
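+ // Fold each 256-bit accumulator down to a single 64-bit sum.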
+ const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+ _mm256_castsi256_si128(h00));
+ const __m128i h00_val =
+ _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+ const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+ _mm256_castsi256_si128(c0));
+ const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h11, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
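+ // Same reduction as the r0 kernel; the results land in C[1] and H[1][1].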
+ const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+ _mm256_castsi256_si128(h11));
+ const __m128i h11_val =
+ _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+ const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+ _mm256_castsi256_si128(c1));
+ const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
#if CONFIG_AV1_HIGHBITDEPTH
int64_t av1_highbd_pixel_proj_error_avx2(
const uint8_t *src8, int width, int height, int src_stride,
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index 41ef66a..e16555e 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -364,6 +364,7 @@
#endif // HAVE_AVX2
} // namespace pickrst_test_highbd
+#endif // CONFIG_AV1_HIGHBITDEPTH
////////////////////////////////////////////////////////////////////////////////
// Get_proj_subspace_Test
@@ -540,4 +541,180 @@
#endif // HAVE_AVX2
} // namespace get_proj_subspace_test_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace get_proj_subspace_test_hbd {
+static const int kIterations = 100;
+
+typedef void (*set_get_proj_subspace_hbd)(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params);
+
+typedef std::tuple<const set_get_proj_subspace_hbd> GetProjSubspaceHBDTestParam;
+
+class GetProjSubspaceTestHBD
+ : public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
+ public:
+ virtual void SetUp() {
+ target_func_ = GET_PARAM(0);
+ src_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(*src_)));
+ ASSERT_NE(src_, nullptr);
+ dgd_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(*dgd_)));
+ ASSERT_NE(dgd_, nullptr);
+ flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(*flt0_)));
+ ASSERT_NE(flt0_, nullptr);
+ flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(*flt1_)));
+ ASSERT_NE(flt1_, nullptr);
+ }
+ virtual void TearDown() {
+ aom_free(src_);
+ aom_free(dgd_);
+ aom_free(flt0_);
+ aom_free(flt1_);
+ }
+ void RunGetProjSubspaceTestHBD(int32_t run_times);
+ void RunGetProjSubspaceTestHBD_ExtremeValues();
+
+ private:
+ set_get_proj_subspace_hbd target_func_;
+ libaom_test::ACMRandom rng_;
+ uint16_t *src_;
+ uint16_t *dgd_;
+ int32_t *flt0_;
+ int32_t *flt1_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTestHBD);
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
+ int h_end = run_times != 1
+ ? 128
+ : ((rng_.Rand16() % MAX_DATA_BLOCK) &
+ 2147483640); // 2147483640 = 0x7ffffff8: clear the low 3
+ // bits so only sizes divisible by 8 are tested.
+ int v_end =
+ run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
+ const int dgd_stride = MAX_DATA_BLOCK;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int flt0_stride = MAX_DATA_BLOCK;
+ const int flt1_stride = MAX_DATA_BLOCK;
+ sgr_params_type params;
+ const int iters = run_times == 1 ? kIterations : 4;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+ int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+ int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_[i] = rng_.Rand16() % 4095;
+ src_[i] = rng_.Rand16() % 4095;
+ flt0_[i] = rng_.Rand15Signed();
+ flt1_[i] = rng_.Rand15Signed();
+ }
+
+ params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+ params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+ params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+ params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+ uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_calc_proj_params_high_bd_c(src, v_end, h_end, src_stride, dgd,
+ dgd_stride, flt0_, flt0_stride, flt1_,
+ flt1_stride, H_ref, C_ref, &params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_,
+ flt0_stride, flt1_, flt1_stride, H_test, C_test, &params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
+ params.r[1], h_end, v_end, time1, time2, time1 / time2);
+ } else {
+ ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+ ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+ ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+ ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+ ASSERT_EQ(C_ref[0], C_test[0]);
+ ASSERT_EQ(C_ref[1], C_test[1]);
+ }
+ }
+}
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
+ const int h_start = 0;
+ int h_end = MAX_DATA_BLOCK;
+ const int v_start = 0;
+ int v_end = MAX_DATA_BLOCK;
+ const int dgd_stride = MAX_DATA_BLOCK;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int flt0_stride = MAX_DATA_BLOCK;
+ const int flt1_stride = MAX_DATA_BLOCK;
+ sgr_params_type params;
+ const int iters = kIterations;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+ int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+ int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
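+ // Use maximally different src/dgd values to stress the 64-bit
+ // accumulators.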
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_[i] = 0;
+ src_[i] = 4095;
+ flt0_[i] = rng_.Rand15Signed();
+ flt1_[i] = rng_.Rand15Signed();
+ }
+ params.r[0] = 1;
+ params.r[1] = 1;
+ params.s[0] = rng_.Rand8() % MAX_RADIUS;
+ params.s[1] = rng_.Rand8() % MAX_RADIUS;
+ uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+ uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+ av1_calc_proj_params_high_bd_c(
+ src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
+ flt0_, flt0_stride, flt1_, flt1_stride, H_ref, C_ref, &params);
+
+ target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd,
+ dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test,
+ C_test, &params);
+
+ ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+ ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+ ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+ ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+ ASSERT_EQ(C_ref[0], C_test[0]);
+ ASSERT_EQ(C_ref[1], C_test[1]);
+ }
+}
+
+TEST_P(GetProjSubspaceTestHBD, RandomValues) { RunGetProjSubspaceTestHBD(1); }
+
+TEST_P(GetProjSubspaceTestHBD, ExtremeValues) {
+ RunGetProjSubspaceTestHBD_ExtremeValues();
+}
+
+TEST_P(GetProjSubspaceTestHBD, DISABLED_Speed) {
+ RunGetProjSubspaceTestHBD(200000);
+}
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHBD,
+ ::testing::Values(av1_calc_proj_params_high_bd_avx2));
+#endif // HAVE_AVX2
+
+} // namespace get_proj_subspace_test_hbd
+
#endif // CONFIG_AV1_HIGHBITDEPTH