JNT_COMP: Round the weighted sum Previously, the weighted sums in convolve were right-shifted without rounding. This patch adds a rounding value before the right shifts. Change-Id: Iea39aca419ac0ca0c32756f345293ce5e28dbd5b
diff --git a/av1/common/convolve.c b/av1/common/convolve.c index b09a60b..3da90b3 100644 --- a/av1/common/convolve.c +++ b/av1/common/convolve.c
@@ -476,7 +476,8 @@ } else { dst[y * dst_stride + x] += res * conv_params->bck_offset; - dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1); + dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], + DIST_PRECISION_BITS - 1); } } } @@ -546,7 +547,8 @@ } else { dst[y * dst_stride + x] += res * conv_params->bck_offset; - dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1); + dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], + DIST_PRECISION_BITS - 1); } } #else @@ -676,7 +678,8 @@ if (conv_params->do_average) { dst[y * dst_stride + x] += res * conv_params->bck_offset; - dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1); + dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], + DIST_PRECISION_BITS - 1); } else { dst[y * dst_stride + x] = res * conv_params->fwd_offset; } @@ -752,7 +755,8 @@ if (conv_params->do_average) { dst[y * dst_stride + x] += res * conv_params->bck_offset; - dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1); + dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], + DIST_PRECISION_BITS - 1); } else { dst[y * dst_stride + x] = res * conv_params->fwd_offset; }
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c index c877f64..7590318 100644 --- a/av1/common/x86/av1_convolve_scale_sse4.c +++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -347,7 +347,8 @@ if (conv_params->do_average) { dst[y * dst_stride + x] += res * conv_params->bck_offset; - dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1); + dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], + DIST_PRECISION_BITS - 1); } else { dst[y * dst_stride + x] = res * conv_params->fwd_offset; } @@ -465,7 +466,8 @@ if (conv_params->do_average) { dst[y * dst_stride + x] += res * conv_params->bck_offset; - dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1); + dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x], + DIST_PRECISION_BITS - 1); } else { dst[y * dst_stride + x] = res * conv_params->fwd_offset; }
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c index a3f4649..d36b4bf 100644 --- a/av1/common/x86/convolve_2d_sse4.c +++ b/av1/common/x86/convolve_2d_sse4.c
@@ -42,6 +42,9 @@ const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0); const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1); + const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); + const __m128i jnt_r = _mm_set_epi32(jnt_round_const, jnt_round_const, + jnt_round_const, jnt_round_const); /* Horizontal filter */ { @@ -196,14 +199,18 @@ if (do_average) { _mm_storeu_si128( p + 0, _mm_srai_epi32( - _mm_add_epi32(_mm_loadu_si128(p + 0), - _mm_mullo_epi32(res_lo_round, wt1)), + _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0), + _mm_mullo_epi32( + res_lo_round, wt1)), + jnt_r), DIST_PRECISION_BITS - 1)); _mm_storeu_si128( p + 1, _mm_srai_epi32( - _mm_add_epi32(_mm_loadu_si128(p + 1), - _mm_mullo_epi32(res_hi_round, wt1)), + _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1), + _mm_mullo_epi32( + res_hi_round, wt1)), + jnt_r), DIST_PRECISION_BITS - 1)); } else { _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0)); @@ -251,6 +258,9 @@ const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0); const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1); + const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2); + const __m128i jnt_r = _mm_set_epi32(jnt_round_const, jnt_round_const, + jnt_round_const, jnt_round_const); /* Horizontal filter */ { @@ -406,14 +416,18 @@ if (do_average) { _mm_storeu_si128( p + 0, _mm_srai_epi32( - _mm_add_epi32(_mm_loadu_si128(p + 0), - _mm_mullo_epi32(res_lo_round, wt1)), + _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0), + _mm_mullo_epi32( + res_lo_round, wt1)), + jnt_r), DIST_PRECISION_BITS - 1)); _mm_storeu_si128( p + 1, _mm_srai_epi32( - _mm_add_epi32(_mm_loadu_si128(p + 1), - _mm_mullo_epi32(res_hi_round, wt1)), + _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1), + _mm_mullo_epi32( + res_hi_round, wt1)), + jnt_r), DIST_PRECISION_BITS - 1)); } else { _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));