Highway: Fix uninitialized value in selfguided
Speed-up (AVX-512 vs. AVX2) on Sapphire Rapids:
  bitdepth  speedup
  8         14.3%
  10        12.2%
  12        13.2%
Change-Id: I76e67ad8c2a21e7eb3031a059e0b2028a4c248dd
diff --git a/av1/common/selfguided_hwy.h b/av1/common/selfguided_hwy.h
index 2c04112..3dd7cfb 100644
--- a/av1/common/selfguided_hwy.h
+++ b/av1/common/selfguided_hwy.h
@@ -122,8 +122,8 @@
constexpr hn::Rebind<T, D> uint_tag;
constexpr hn::Repartition<int16_t, D> int16_tag;
// Write out the zero top row
- hwy::ZeroBytes(A, width);
- hwy::ZeroBytes(B, width);
+ hwy::ZeroBytes(A, 4 * (width + 8));
+ hwy::ZeroBytes(B, 4 * (width + 8));
for (int i = 0; i < height; ++i) {
// Zero the left column.
@@ -316,61 +316,48 @@
}
for (int i = -1; i < height + 1; i += Step) {
- constexpr int kLineBufferElements = 2 * hn::MaxLanes(int32_tag);
- for (int j = -1; j < width + 1; j += kLineBufferElements) {
- // Gathering is incredibly high latency; store to intermediates to hide as
- // much latency as possible.
- HWY_ALIGN int32_t sum1_array[kLineBufferElements];
- HWY_ALIGN int32_t a_res_array[kLineBufferElements];
- for (int k = 0; k < kLineBufferElements; k += hn::MaxLanes(int32_tag)) {
- const int32_t *HWY_RESTRICT Cij = C + i * buf_stride + j + k;
- const int32_t *HWY_RESTRICT Dij = D + i * buf_stride + j + k;
+ for (int j = -1; j < width + 1;
+ j += static_cast<int>(hn::MaxLanes(int32_tag))) {
+ const int32_t *HWY_RESTRICT Cij = C + i * buf_stride + j;
+ const int32_t *HWY_RESTRICT Dij = D + i * buf_stride + j;
- auto sum1 = BoxSumFromII(int32_tag, Dij, buf_stride, r);
- auto sum2 = BoxSumFromII(int32_tag, Cij, buf_stride, r);
+ auto sum1 = BoxSumFromII(int32_tag, Dij, buf_stride, r);
+ auto sum2 = BoxSumFromII(int32_tag, Cij, buf_stride, r);
- // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
- // some uninitialised data in their upper words. We use a mask to
- // ensure that these bits are set to 0.
- int idx = AOMMIN(8, width + 1 - j);
- assert(idx >= 1);
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
- if (idx < 8) {
- sum1 = hn::IfThenElseZero(mask[idx], sum1);
- sum2 = hn::IfThenElseZero(mask[idx], sum2);
- }
-
- const auto p = ComputeP(int32_tag, sum1, sum2, bit_depth, n);
-
- const auto z = hn::BitCast(
- int32_tag, hn::Min(hn::ShiftRight<SGRPROJ_MTABLE_BITS>(hn::BitCast(
- uint32_tag, hn::MulAdd(p, s, rnd_z))),
- hn::Set(uint32_tag, 255)));
-
- const auto a_res = hn::GatherIndex(int32_tag, av1_x_by_xplus1, z);
- hn::Store(sum1, int32_tag, &sum1_array[k]);
- hn::Store(a_res, int32_tag, &a_res_array[k]);
+ if (idx < 8) {
+ sum1 = hn::IfThenElseZero(mask[idx], sum1);
+ sum2 = hn::IfThenElseZero(mask[idx], sum2);
}
- for (int k = 0; k < kLineBufferElements; k += hn::MaxLanes(int32_tag)) {
- const auto a_res = hn::Load(int32_tag, &a_res_array[k]);
- const auto sum1 = hn::Load(int32_tag, &sum1_array[k]);
- hn::StoreU(a_res, int32_tag, A + i * buf_stride + j + k);
+ const auto p = ComputeP(int32_tag, sum1, sum2, bit_depth, n);
- const auto a_complement =
- hn::Sub(hn::Set(int32_tag, SGRPROJ_SGR), a_res);
+ const auto z = hn::BitCast(
+ int32_tag, hn::Min(hn::ShiftRight<SGRPROJ_MTABLE_BITS>(hn::BitCast(
+ uint32_tag, hn::MulAdd(p, s, rnd_z))),
+ hn::Set(uint32_tag, 255)));
- // sum1 might have lanes greater than 2^15, so we can't use madd to do
- // multiplication involving sum1. However, a_complement and one_over_n
- // are both less than 256, so we can multiply them first.
- const auto a_comp_over_n = hn::WidenMulPairwiseAdd(
- int32_tag, hn::BitCast(int16_tag, a_complement), one_over_n);
- const auto b_int = hn::Mul(a_comp_over_n, sum1);
- const auto b_res =
- hn::ShiftRight<SGRPROJ_RECIP_BITS>(hn::Add(b_int, rnd_res));
+ const auto a_res = hn::GatherIndex(int32_tag, av1_x_by_xplus1, z);
- hn::StoreU(b_res, int32_tag, B + i * buf_stride + j + k);
- }
+ hn::StoreU(a_res, int32_tag, A + i * buf_stride + j);
+
+ const auto a_complement = hn::Sub(hn::Set(int32_tag, SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const auto a_comp_over_n = hn::WidenMulPairwiseAdd(
+ int32_tag, hn::BitCast(int16_tag, a_complement), one_over_n);
+ const auto b_int = hn::Mul(a_comp_over_n, sum1);
+ const auto b_res =
+ hn::ShiftRight<SGRPROJ_RECIP_BITS>(hn::Add(b_int, rnd_res));
+
+ hn::StoreU(b_res, int32_tag, B + i * buf_stride + j);
}
}
}