JNT_COMP: Round the weighted sum

Previously, the weighted sums in convolve were right-shifted without
rounding. This patch adds a rounding value before the right shifts.

Change-Id: Iea39aca419ac0ca0c32756f345293ce5e28dbd5b
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index b09a60b..3da90b3 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -476,7 +476,8 @@
         } else {
           dst[y * dst_stride + x] += res * conv_params->bck_offset;
 
-          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+          dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
+                                                       DIST_PRECISION_BITS - 1);
         }
       }
     }
@@ -546,7 +547,8 @@
         } else {
           dst[y * dst_stride + x] += res * conv_params->bck_offset;
 
-          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+          dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
+                                                       DIST_PRECISION_BITS - 1);
         }
       }
 #else
@@ -676,7 +678,8 @@
         if (conv_params->do_average) {
           dst[y * dst_stride + x] += res * conv_params->bck_offset;
 
-          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+          dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
+                                                       DIST_PRECISION_BITS - 1);
         } else {
           dst[y * dst_stride + x] = res * conv_params->fwd_offset;
         }
@@ -752,7 +755,8 @@
         if (conv_params->do_average) {
           dst[y * dst_stride + x] += res * conv_params->bck_offset;
 
-          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+          dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
+                                                       DIST_PRECISION_BITS - 1);
         } else {
           dst[y * dst_stride + x] = res * conv_params->fwd_offset;
         }
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index c877f64..7590318 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -347,7 +347,8 @@
         if (conv_params->do_average) {
           dst[y * dst_stride + x] += res * conv_params->bck_offset;
 
-          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+          dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
+                                                       DIST_PRECISION_BITS - 1);
         } else {
           dst[y * dst_stride + x] = res * conv_params->fwd_offset;
         }
@@ -465,7 +466,8 @@
         if (conv_params->do_average) {
           dst[y * dst_stride + x] += res * conv_params->bck_offset;
 
-          dst[y * dst_stride + x] >>= (DIST_PRECISION_BITS - 1);
+          dst[y * dst_stride + x] = ROUND_POWER_OF_TWO(dst[y * dst_stride + x],
+                                                       DIST_PRECISION_BITS - 1);
         } else {
           dst[y * dst_stride + x] = res * conv_params->fwd_offset;
         }
diff --git a/av1/common/x86/convolve_2d_sse4.c b/av1/common/x86/convolve_2d_sse4.c
index a3f4649..d36b4bf 100644
--- a/av1/common/x86/convolve_2d_sse4.c
+++ b/av1/common/x86/convolve_2d_sse4.c
@@ -42,6 +42,9 @@
   const int w1 = conv_params->bck_offset;
   const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
   const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
+  const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
+  const __m128i jnt_r = _mm_set_epi32(jnt_round_const, jnt_round_const,
+                                      jnt_round_const, jnt_round_const);
 
   /* Horizontal filter */
   {
@@ -196,14 +199,18 @@
           if (do_average) {
             _mm_storeu_si128(
                 p + 0, _mm_srai_epi32(
-                           _mm_add_epi32(_mm_loadu_si128(p + 0),
-                                         _mm_mullo_epi32(res_lo_round, wt1)),
+                           _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
+                                                       _mm_mullo_epi32(
+                                                           res_lo_round, wt1)),
+                                         jnt_r),
                            DIST_PRECISION_BITS - 1));
 
             _mm_storeu_si128(
                 p + 1, _mm_srai_epi32(
-                           _mm_add_epi32(_mm_loadu_si128(p + 1),
-                                         _mm_mullo_epi32(res_hi_round, wt1)),
+                           _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1),
+                                                       _mm_mullo_epi32(
+                                                           res_hi_round, wt1)),
+                                         jnt_r),
                            DIST_PRECISION_BITS - 1));
           } else {
             _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));
@@ -251,6 +258,9 @@
   const int w1 = conv_params->bck_offset;
   const __m128i wt0 = _mm_set_epi32(w0, w0, w0, w0);
   const __m128i wt1 = _mm_set_epi32(w1, w1, w1, w1);
+  const int jnt_round_const = 1 << (DIST_PRECISION_BITS - 2);
+  const __m128i jnt_r = _mm_set_epi32(jnt_round_const, jnt_round_const,
+                                      jnt_round_const, jnt_round_const);
 
   /* Horizontal filter */
   {
@@ -406,14 +416,18 @@
           if (do_average) {
             _mm_storeu_si128(
                 p + 0, _mm_srai_epi32(
-                           _mm_add_epi32(_mm_loadu_si128(p + 0),
-                                         _mm_mullo_epi32(res_lo_round, wt1)),
+                           _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 0),
+                                                       _mm_mullo_epi32(
+                                                           res_lo_round, wt1)),
+                                         jnt_r),
                            DIST_PRECISION_BITS - 1));
 
             _mm_storeu_si128(
                 p + 1, _mm_srai_epi32(
-                           _mm_add_epi32(_mm_loadu_si128(p + 1),
-                                         _mm_mullo_epi32(res_hi_round, wt1)),
+                           _mm_add_epi32(_mm_add_epi32(_mm_loadu_si128(p + 1),
+                                                       _mm_mullo_epi32(
+                                                           res_hi_round, wt1)),
+                                         jnt_r),
                            DIST_PRECISION_BITS - 1));
           } else {
             _mm_storeu_si128(p + 0, _mm_mullo_epi32(res_lo_round, wt0));