[wedge/compound-segment, normative] Reduce multiple rounding

As described in the linked bug report, the masked blend operation
contains multiple stages of rounding. This patch replaces the
intermediate rounding in aom_blend_a64_d32_mask_c() with a plain right
shift, deferring rounding to the later pixel-precision stage; this
should be slightly faster and more accurate.
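
For illustration, a minimal sketch of the effect (not part of the patch),
assuming the ROUND_POWER_OF_TWO macro from aom_dsp/aom_dsp_common.h, the
6-bit blend rounding, and a hypothetical 2-bit final rounding stage down
to pixel precision:

    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

    int32_t acc = 96;  /* example intermediate blend accumulator */
    /* Old behaviour: round at blend time, then round again later -> 1 */
    int twice = ROUND_POWER_OF_TWO(ROUND_POWER_OF_TWO(acc, 6), 2);
    /* New behaviour: plain shift at blend time, round once later -> 0 */
    int once = ROUND_POWER_OF_TWO(acc >> 6, 2);
    /* Reference: a single rounding by 6 + 2 = 8 bits also gives  -> 0 */
    int ref = ROUND_POWER_OF_TWO(acc, 8);

The shifted path matches the single-rounding reference, while rounding
twice does not.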

BUG=aomedia:1292

Change-Id: Ib24ce687e628b05d645fbde5306ee552f7ad876b
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index f8f9a1c..4c42274 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -22,6 +22,17 @@
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
 // be the same as dst, or dst can be different from both sources.
 
+// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d32 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
 void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
                               const int32_t *src0, uint32_t src0_stride,
                               const int32_t *src1, uint32_t src1_stride,
@@ -41,8 +52,10 @@
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
         const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   } else if (subw == 1 && subh == 1) {
@@ -54,8 +67,10 @@
                 mask[(2 * i) * mask_stride + (2 * j + 1)] +
                 mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
             2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   } else if (subw == 1 && subh == 0) {
@@ -63,8 +78,10 @@
       for (j = 0; j < w; ++j) {
         const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                     mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   } else {
@@ -72,8 +89,10 @@
       for (j = 0; j < w; ++j) {
         const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                     mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   }
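
As a quick standalone check of the identity cited in the new comment (a
sketch only, not part of the patch; it assumes just the standard
ROUND_POWER_OF_TWO definition from aom_dsp/aom_dsp_common.h):

    #include <assert.h>
    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

    int main(void) {
      /* Exhaustively verify ROUND_POWER_OF_TWO(x >> y, z) ==
       * ROUND_POWER_OF_TWO(x, y + z) for small non-negative x. */
      for (int32_t x = 0; x < (1 << 16); ++x) {
        for (int y = 1; y <= 8; ++y) {
          for (int z = 1; z <= 8; ++z) {
            assert(ROUND_POWER_OF_TWO(x >> y, z) ==
                   ROUND_POWER_OF_TWO(x, y + z));
          }
        }
      }
      return 0;
    }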