Increase masked SAD resolution

Return the full-precision masked SAD instead of downscaling it with
(sad + 31) >> 6. This slightly improves coding efficiency:

      avg PSNR   ovr PSNR   SSIM
low   -0.040     -0.048     -0.075
mid   -0.050     -0.062     -0.075

STATS_CHANGED

Change-Id: Ie9302f6271b5c6389a8543c58dcfc8d8153c1874
diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c
index 10c6ac7..1ff3d4f 100644
--- a/aom_dsp/sad_av1.c
+++ b/aom_dsp/sad_av1.c
@@ -35,7 +35,6 @@
     b += b_stride;
     m += m_stride;
   }
-  sad = (sad + 31) >> 6;
   return sad;
 }
 
@@ -101,7 +100,6 @@
     b += b_stride;
     m += m_stride;
   }
-  sad = (sad + 31) >> 6;
 
   return sad;
 }
diff --git a/aom_dsp/x86/masked_sad_intrin_avx2.c b/aom_dsp/x86/masked_sad_intrin_avx2.c
index 774fab6..60f0ab3 100644
--- a/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ b/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -64,7 +64,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
@@ -117,7 +117,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE unsigned int aom_masked_sad_avx2(
@@ -253,7 +253,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE unsigned int highbd_masked_sad16xh_avx2(
@@ -311,7 +311,7 @@
   res = _mm256_hadd_epi32(res, res);
   res = _mm256_hadd_epi32(res, res);
   int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 static INLINE unsigned int aom_highbd_masked_sad_avx2(
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
index a179f2e..7168277 100644
--- a/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -134,7 +134,7 @@
   // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
   int32_t sad =
       _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -179,7 +179,7 @@
   }
   int32_t sad =
       _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
@@ -223,7 +223,7 @@
   }
   // At this point, the SAD is stored in lane 0 of 'res'
   int32_t sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 // For width a multiple of 8
@@ -338,7 +338,7 @@
   res = _mm_hadd_epi32(res, res);
   res = _mm_hadd_epi32(res, res);
   int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
+  return sad;
 }
 
 unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
@@ -398,5 +398,5 @@
   res = _mm_hadd_epi32(res, res);
   res = _mm_hadd_epi32(res, res);
   int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
+  return sad;
 }