Fix high bit-depth convolve function

Fix the high bit-depth C function to make it compatible with the
convolve functions that produce 16-bit intermediate results.

Change-Id: I3844b69098168c09f8a120d39ee3f4b326031063
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 674ee3f..6c8531f 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -1438,15 +1438,20 @@
       res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
       if (conv_params->use_jnt_comp_avg) {
         if (conv_params->do_average) {
-          dst[y * dst_stride + x] += res * conv_params->bck_offset;
+          int32_t tmp = dst[y * dst_stride + x];
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
         } else {
-          dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+          dst[y * dst_stride + x] = res;
         }
       } else {
-        if (conv_params->do_average)
-          dst[y * dst_stride + x] += res;
-        else
+        if (conv_params->do_average) {
+          int32_t tmp = dst[y * dst_stride + x];
+          tmp += res;
+          dst[y * dst_stride + x] = tmp >> 1;
+        } else {
           dst[y * dst_stride + x] = res;
+        }
       }
     }
   }
@@ -1481,15 +1486,20 @@
       res = ROUND_POWER_OF_TWO(res, conv_params->round_1);
       if (conv_params->use_jnt_comp_avg) {
         if (conv_params->do_average) {
-          dst[y * dst_stride + x] += res * conv_params->bck_offset;
+          int32_t tmp = dst[y * dst_stride + x];
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
         } else {
-          dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+          dst[y * dst_stride + x] = res;
         }
       } else {
-        if (conv_params->do_average)
-          dst[y * dst_stride + x] += res;
-        else
+        if (conv_params->do_average) {
+          int32_t tmp = dst[y * dst_stride + x];
+          tmp += res;
+          dst[y * dst_stride + x] = tmp >> 1;
+        } else {
           dst[y * dst_stride + x] = res;
+        }
       }
     }
   }
@@ -1518,15 +1528,20 @@
       CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
       if (conv_params->use_jnt_comp_avg) {
         if (conv_params->do_average) {
-          dst[y * dst_stride + x] += res * conv_params->bck_offset;
+          int32_t tmp = dst[y * dst_stride + x];
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          dst[y * dst_stride + x] = tmp >> DIST_PRECISION_BITS;
         } else {
-          dst[y * dst_stride + x] = res * conv_params->fwd_offset;
+          dst[y * dst_stride + x] = res;
         }
       } else {
-        if (conv_params->do_average)
-          dst[y * dst_stride + x] += res;
-        else
+        if (conv_params->do_average) {
+          int32_t tmp = dst[y * dst_stride + x];
+          tmp += res;
+          dst[y * dst_stride + x] = tmp >> 1;
+        } else {
           dst[y * dst_stride + x] = res;
+        }
       }
     }
   }