Highway: Fix uninitialized value in selfguided. Speed up (avx512 vs avx2) on Sapphire Rapids, by bitdepth: 8-bit: 14.3%; 10-bit: 12.2%; 12-bit: 13.2%. Change-Id: I76e67ad8c2a21e7eb3031a059e0b2028a4c248dd
diff --git a/av1/common/selfguided_hwy.h b/av1/common/selfguided_hwy.h index 2c04112..3dd7cfb 100644 --- a/av1/common/selfguided_hwy.h +++ b/av1/common/selfguided_hwy.h
@@ -122,8 +122,8 @@ constexpr hn::Rebind<T, D> uint_tag; constexpr hn::Repartition<int16_t, D> int16_tag; // Write out the zero top row - hwy::ZeroBytes(A, width); - hwy::ZeroBytes(B, width); + hwy::ZeroBytes(A, 4 * (width + 8)); + hwy::ZeroBytes(B, 4 * (width + 8)); for (int i = 0; i < height; ++i) { // Zero the left column. @@ -316,61 +316,48 @@ } for (int i = -1; i < height + 1; i += Step) { - constexpr int kLineBufferElements = 2 * hn::MaxLanes(int32_tag); - for (int j = -1; j < width + 1; j += kLineBufferElements) { - // Gathering is incredibly high latency; store to intermediates to hide as - // much latency as possible. - HWY_ALIGN int32_t sum1_array[kLineBufferElements]; - HWY_ALIGN int32_t a_res_array[kLineBufferElements]; - for (int k = 0; k < kLineBufferElements; k += hn::MaxLanes(int32_tag)) { - const int32_t *HWY_RESTRICT Cij = C + i * buf_stride + j + k; - const int32_t *HWY_RESTRICT Dij = D + i * buf_stride + j + k; + for (int j = -1; j < width + 1; + j += static_cast<int>(hn::MaxLanes(int32_tag))) { + const int32_t *HWY_RESTRICT Cij = C + i * buf_stride + j; + const int32_t *HWY_RESTRICT Dij = D + i * buf_stride + j; - auto sum1 = BoxSumFromII(int32_tag, Dij, buf_stride, r); - auto sum2 = BoxSumFromII(int32_tag, Cij, buf_stride, r); + auto sum1 = BoxSumFromII(int32_tag, Dij, buf_stride, r); + auto sum2 = BoxSumFromII(int32_tag, Cij, buf_stride, r); - // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain - // some uninitialised data in their upper words. We use a mask to - // ensure that these bits are set to 0. - int idx = AOMMIN(8, width + 1 - j); - assert(idx >= 1); + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. 
+ int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); - if (idx < 8) { - sum1 = hn::IfThenElseZero(mask[idx], sum1); - sum2 = hn::IfThenElseZero(mask[idx], sum2); - } - - const auto p = ComputeP(int32_tag, sum1, sum2, bit_depth, n); - - const auto z = hn::BitCast( - int32_tag, hn::Min(hn::ShiftRight<SGRPROJ_MTABLE_BITS>(hn::BitCast( - uint32_tag, hn::MulAdd(p, s, rnd_z))), - hn::Set(uint32_tag, 255))); - - const auto a_res = hn::GatherIndex(int32_tag, av1_x_by_xplus1, z); - hn::Store(sum1, int32_tag, &sum1_array[k]); - hn::Store(a_res, int32_tag, &a_res_array[k]); + if (idx < 8) { + sum1 = hn::IfThenElseZero(mask[idx], sum1); + sum2 = hn::IfThenElseZero(mask[idx], sum2); } - for (int k = 0; k < kLineBufferElements; k += hn::MaxLanes(int32_tag)) { - const auto a_res = hn::Load(int32_tag, &a_res_array[k]); - const auto sum1 = hn::Load(int32_tag, &sum1_array[k]); - hn::StoreU(a_res, int32_tag, A + i * buf_stride + j + k); + const auto p = ComputeP(int32_tag, sum1, sum2, bit_depth, n); - const auto a_complement = - hn::Sub(hn::Set(int32_tag, SGRPROJ_SGR), a_res); + const auto z = hn::BitCast( + int32_tag, hn::Min(hn::ShiftRight<SGRPROJ_MTABLE_BITS>(hn::BitCast( + uint32_tag, hn::MulAdd(p, s, rnd_z))), + hn::Set(uint32_tag, 255))); - // sum1 might have lanes greater than 2^15, so we can't use madd to do - // multiplication involving sum1. However, a_complement and one_over_n - // are both less than 256, so we can multiply them first. 
- const auto a_comp_over_n = hn::WidenMulPairwiseAdd( - int32_tag, hn::BitCast(int16_tag, a_complement), one_over_n); - const auto b_int = hn::Mul(a_comp_over_n, sum1); - const auto b_res = - hn::ShiftRight<SGRPROJ_RECIP_BITS>(hn::Add(b_int, rnd_res)); + const auto a_res = hn::GatherIndex(int32_tag, av1_x_by_xplus1, z); - hn::StoreU(b_res, int32_tag, B + i * buf_stride + j + k); - } + hn::StoreU(a_res, int32_tag, A + i * buf_stride + j); + + const auto a_complement = hn::Sub(hn::Set(int32_tag, SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const auto a_comp_over_n = hn::WidenMulPairwiseAdd( + int32_tag, hn::BitCast(int16_tag, a_complement), one_over_n); + const auto b_int = hn::Mul(a_comp_over_n, sum1); + const auto b_res = + hn::ShiftRight<SGRPROJ_RECIP_BITS>(hn::Add(b_int, rnd_res)); + + hn::StoreU(b_res, int32_tag, B + i * buf_stride + j); } } }