Prevent negative variance in 10/12-bit masked sub-pixel variance.

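Due to rounding, the computed variance can become negative: the SSE and
the sum are rounded separately, so the squared-sum term can exceed the
rounded SSE, and the unsigned subtraction then wraps around to a very
large value instead of a small one.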
Similar behaviour was fixed in commit b7e7c044b0.
This change applies the corresponding fix to the masked_variance code:
the variance is computed in a signed 64-bit value and clamped to zero
before being returned.

BUG=aomedia:646

Change-Id: I21ed5a3f58080def08ac7ab9a77cbcad90318c99
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index ec8aa3b..34043bf 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -570,6 +570,7 @@
       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     uint64_t sse64;                                                         \
     int sum;                                                                \
+    int64_t var;                                                            \
     uint16_t temp[(H + 1) * W];                                             \
     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
@@ -585,7 +586,8 @@
                              msk_stride, W, H, &sse64, &sum);               \
     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                          \
     sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
   }                                                                         \
   unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(    \
       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
@@ -593,6 +595,7 @@
       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     uint64_t sse64;                                                         \
     int sum;                                                                \
+    int64_t var;                                                            \
     uint16_t temp[(H + 1) * W];                                             \
     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
@@ -608,7 +611,8 @@
                              msk_stride, W, H, &sse64, &sum);               \
     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                          \
     sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
   }
 
 #define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H)                                  \
@@ -640,6 +644,7 @@
       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     int sse_;                                                               \
     int sum;                                                                \
+    int64_t var;                                                            \
     uint16_t temp[(H + 1) * 4];                                             \
     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
@@ -655,7 +660,8 @@
                                 msk_stride, H, &sse_, &sum);                \
     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4);                           \
     sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
   }                                                                         \
   unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3(        \
       const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
@@ -663,6 +669,7 @@
       const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
     int sse_;                                                               \
     int sum;                                                                \
+    int64_t var;                                                            \
     uint16_t temp[(H + 1) * 4];                                             \
     const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
     const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
@@ -678,7 +685,8 @@
                                 msk_stride, H, &sse_, &sum);                \
     *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8);                           \
     sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
   }
 
 #if CONFIG_EXT_PARTITION