Add AVX2 variant of av1_calc_proj_params_high_bd_c

This CL adds an AVX2 variant of the function
av1_calc_proj_params_high_bd_c().

                Encode Time
    cpu-used    Reduction(%)
       3          0.387
       4          0.558

Module-level performance improves by a factor of ~5.9x w.r.t. the C code.

Change-Id: I088a5088f78f50bb77ffca4424769a17832d0bce
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 00ba788..58648a5 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -405,6 +405,9 @@
   add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
   specialize qw/av1_calc_proj_params avx2/;
 
+  add_proto qw/void av1_calc_proj_params_high_bd/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+  specialize qw/av1_calc_proj_params_high_bd avx2/;  # The AVX2 kernel consumes 8 pixels per step; the pickrst.c call site only dispatches to it when width is a multiple of 8.
+
   add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
   specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
 
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 4925a3d..0123520 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -631,11 +631,13 @@
   }
 }
 
-static AOM_INLINE void av1_calc_proj_params_high_bd_c(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2],
-    const sgr_params_type *params) {
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int32_t *flt0,
+                                    int flt0_stride, int32_t *flt1,
+                                    int flt1_stride, int64_t H[2][2],
+                                    int64_t C[2],
+                                    const sgr_params_type *params) {
   if ((params->r[0] > 0) && (params->r[1] > 0)) {
     calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
                                      dat_stride, flt0, flt0_stride, flt1,
@@ -673,9 +675,15 @@
                              params);
     }
   } else {
-    av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+    if ((width & 0x7) == 0) {
+      av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
                                    dat_stride, flt0, flt0_stride, flt1,
                                    flt1_stride, H, C, params);
+    } else {
+      av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, H, C, params);
+    }
   }
 
   if (params->r[0] == 0) {
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index ef70a7b..32438d5 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -860,6 +860,260 @@
   }
 }
 
+// Computes the self-guided projection statistics H (2x2) and C (2x1) for
+// high bit-depth input when both filter outputs (flt0 and flt1) are used.
+//
+// For every pixel, with u = dat << SGRPROJ_RST_BITS:
+//   s  = (src << SGRPROJ_RST_BITS) - u
+//   f1 = flt0 - u
+//   f2 = flt1 - u
+// the sums of f1 * f1, f1 * f2, f2 * f2, f1 * s and f2 * s are accumulated in
+// 64-bit lanes, divided by width * height and stored to H[0][0], H[0][1],
+// H[1][1], C[0] and C[1] respectively. H[1][0] is copied from H[0][1].
+//
+// Eight pixels are consumed per inner iteration, so width must be a multiple
+// of 8. H and C are overwritten rather than accumulated into.
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h00, h01, h11, c0, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      // Unaligned loads: nothing in this interface guarantees that the pixel
+      // rows are 16-byte aligned.
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      // _mm256_mul_epi32 only multiplies the even 32-bit elements, so the odd
+      // elements are shifted down and multiplied separately.
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+      const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h01 = _mm256_add_epi64(h01, h01_even);
+      h01 = _mm256_add_epi64(h01, h01_odd);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  // Interleave c0/c1 and fold their 64-bit partial sums so that the result
+  // holds {sum(c0), sum(c1)}; H[0] is reduced the same way from h00/h01.
+  __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+  const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+  c_low = _mm256_add_epi64(c_low, c_high);
+  const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+                                         _mm256_castsi256_si128(c_low));
+
+  __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+  const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+  h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+  const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+                                           _mm256_castsi256_si128(h0x_low));
+
+  // H is symmetric, so H[1][0] is not accumulated: h11 is paired with zero
+  // and lands in the upper 64 bits, i.e. in H[1][1].
+  __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+  const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+  h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+  const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+                                           _mm256_castsi256_si128(h1x_low));
+
+  xx_storeu_128(C, c_128bit);
+  xx_storeu_128(H[0], h0x_128bit);
+  xx_storeu_128(H[1], h1x_128bit);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// Single-filter counterpart for flt0: computes only H[0][0] and C[0]. The
+// 128-bit stores additionally write zero to H[0][1] and C[1]; H[1][0] and
+// H[1][1] are not written. width must be a multiple of 8.
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h00, c0;
+  const __m256i zero = _mm256_setzero_si256();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      // Unaligned loads: pixel rows need not be 16-byte aligned.
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(src + i * src_stride + j)));
+      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f1 = _mm256_sub_epi32(f1, d);
+
+      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+                                               _mm256_srli_epi64(f1, 32));
+      h00 = _mm256_add_epi64(h00, h00_even);
+      h00 = _mm256_add_epi64(h00, h00_odd);
+
+      const __m256i c0_even = _mm256_mul_epi32(f1, s);
+      const __m256i c0_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+      c0 = _mm256_add_epi64(c0, c0_even);
+      c0 = _mm256_add_epi64(c0, c0_odd);
+    }
+  }
+  // Fold the four 64-bit partial sums of each accumulator into one value.
+  const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+                                           _mm256_castsi256_si128(h00));
+  const __m128i h00_val =
+      _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+  const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+                                          _mm256_castsi256_si128(c0));
+  const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// Single-filter counterpart for flt1: computes only H[1][1] and C[1]. The
+// 128-bit stores additionally write zero to H[1][0] and C[0]; H[0][0] and
+// H[0][1] are not written. width must be a multiple of 8.
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  __m256i h11, c1;
+  const __m256i zero = _mm256_setzero_si256();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      // Unaligned loads: pixel rows need not be 16-byte aligned.
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(src + i * src_stride + j)));
+      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm256_sub_epi32(s, d);
+      f2 = _mm256_sub_epi32(f2, d);
+
+      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+                                               _mm256_srli_epi64(f2, 32));
+      h11 = _mm256_add_epi64(h11, h11_even);
+      h11 = _mm256_add_epi64(h11, h11_odd);
+
+      const __m256i c1_even = _mm256_mul_epi32(f2, s);
+      const __m256i c1_odd =
+          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+      c1 = _mm256_add_epi64(c1, c1_even);
+      c1 = _mm256_add_epi64(c1, c1_odd);
+    }
+  }
+
+  // Fold the four 64-bit partial sums of each accumulator into one value.
+  const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+                                           _mm256_castsi256_si128(h11));
+  const __m128i h11_val =
+      _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+  const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+                                          _mm256_castsi256_si128(c1));
+  const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+  const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c. Dispatches on which of
+// params->r[0] / params->r[1] are positive and does nothing when neither is.
+// width must be a multiple of 8 because the kernels process 8 pixels per step.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                        dat_stride, flt0, flt0_stride, flt1,
+                                        flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
 #if CONFIG_AV1_HIGHBITDEPTH
 int64_t av1_highbd_pixel_proj_error_avx2(
     const uint8_t *src8, int width, int height, int src_stride,
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index 41ef66a..e16555e 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -364,6 +364,7 @@
 #endif  // HAVE_AVX2
 
 }  // namespace pickrst_test_highbd
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 ////////////////////////////////////////////////////////////////////////////////
 // Get_proj_subspace_Test
@@ -540,4 +541,192 @@
 #endif  // HAVE_AVX2
 
 }  // namespace get_proj_subspace_test_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace get_proj_subspace_test_hbd {
+static const int kIterations = 100;
+
+// Signature shared by av1_calc_proj_params_high_bd_c and its SIMD variants.
+typedef void (*set_get_proj_subspace_hbd)(const uint8_t *src8, int width,
+                                          int height, int src_stride,
+                                          const uint8_t *dat8, int dat_stride,
+                                          int32_t *flt0, int flt0_stride,
+                                          int32_t *flt1, int flt1_stride,
+                                          int64_t H[2][2], int64_t C[2],
+                                          const sgr_params_type *params);
+
+typedef std::tuple<const set_get_proj_subspace_hbd> GetProjSubspaceHBDTestParam;
+
+// Checks a SIMD implementation of av1_calc_proj_params_high_bd against the C
+// reference on buffers of MAX_DATA_BLOCK x MAX_DATA_BLOCK elements.
+class GetProjSubspaceTestHBD
+    : public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
+ public:
+  virtual void SetUp() {
+    target_func_ = GET_PARAM(0);
+    src_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = (uint16_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+                                   sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunGetProjSubspaceTestHBD(int32_t run_times);
+  void RunGetProjSubspaceTestHBD_ExtremeValues();
+
+ private:
+  set_get_proj_subspace_hbd target_func_;
+  libaom_test::ACMRandom rng_;
+  // Null-initialised so that TearDown() never frees an indeterminate pointer
+  // when SetUp() aborts early (assumes aom_free(nullptr) is a no-op).
+  uint16_t *src_ = nullptr;
+  uint16_t *dgd_ = nullptr;
+  int32_t *flt0_ = nullptr;
+  int32_t *flt1_ = nullptr;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTestHBD);
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
+  // Dimensions are rounded down to a multiple of 8 because the SIMD kernels
+  // consume 8 pixels per step. They are also clamped to at least 8: with a
+  // zero dimension the AVX2 kernels would divide by width * height == 0.
+  int h_end =
+      run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
+  int v_end =
+      run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
+  if (h_end < 8) h_end = 8;
+  if (v_end < 8) v_end = 8;
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = run_times == 1 ? kIterations : 4;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      // Cover the full 12-bit sample range [0, 4095].
+      dgd_[i] = rng_.Rand16() % 4096;
+      src_[i] = rng_.Rand16() % 4096;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+
+    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
+    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_calc_proj_params_high_bd_c(src, h_end, v_end, src_stride, dgd,
+                                     dgd_stride, flt0_, flt0_stride, flt1_,
+                                     flt1_stride, H_ref, C_ref, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, h_end, v_end, src_stride, dgd, dgd_stride, flt0_,
+                   flt0_stride, flt1_, flt1_stride, H_test, C_test, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      // Speed run: report timings only; aom_usec_timer counts microseconds.
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    } else {
+      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+      ASSERT_EQ(C_ref[0], C_test[0]);
+      ASSERT_EQ(C_ref[1], C_test[1]);
+    }
+  }
+}
+
+// Runs the comparison on full-size blocks with dgd at zero and src at the
+// 12-bit maximum, i.e. the largest possible src - dgd difference.
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
+  const int h_start = 0;
+  int h_end = MAX_DATA_BLOCK;
+  const int v_start = 0;
+  int v_end = MAX_DATA_BLOCK;
+  const int dgd_stride = MAX_DATA_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int flt0_stride = MAX_DATA_BLOCK;
+  const int flt1_stride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  const int iters = kIterations;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 4095;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = 1;
+    params.r[1] = 1;
+    params.s[0] = rng_.Rand8() % MAX_RADIUS;
+    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    av1_calc_proj_params_high_bd_c(
+        src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
+        flt0_, flt0_stride, flt1_, flt1_stride, H_ref, C_ref, &params);
+
+    target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd,
+                 dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test,
+                 C_test, &params);
+
+    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+    ASSERT_EQ(C_ref[0], C_test[0]);
+    ASSERT_EQ(C_ref[1], C_test[1]);
+  }
+}
+
+TEST_P(GetProjSubspaceTestHBD, RandomValues) { RunGetProjSubspaceTestHBD(1); }
+
+TEST_P(GetProjSubspaceTestHBD, ExtremeValues) {
+  RunGetProjSubspaceTestHBD_ExtremeValues();
+}
+
+TEST_P(GetProjSubspaceTestHBD, DISABLED_Speed) {
+  RunGetProjSubspaceTestHBD(200000);
+}
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHBD,
+                         ::testing::Values(av1_calc_proj_params_high_bd_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace get_proj_subspace_test_hbd
+
 #endif  // CONFIG_AV1_HIGHBITDEPTH