[wedge/compound-segment, normative] Reduce multiple rounding

As described in the linked bug report, the masked blend operation
contains multiple stages of rounding. This patch replaces the
intermediate rounding in aom_blend_a64_d32_mask_c() with a plain right
shift, deferring rounding to the later pixel-precision stage; this
should be slightly faster and more accurate.
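
For illustration, a minimal sketch of the effect (not part of the patch),
assuming the ROUND_POWER_OF_TWO macro from aom_dsp/aom_dsp_common.h, the
6-bit blend rounding, and a hypothetical 2-bit final rounding stage down
to pixel precision:

    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

    int32_t acc = 96;  /* example intermediate blend accumulator */
    /* Old behaviour: round at blend time, then round again later -> 1 */
    int twice = ROUND_POWER_OF_TWO(ROUND_POWER_OF_TWO(acc, 6), 2);
    /* New behaviour: plain shift at blend time, round once later -> 0 */
    int once = ROUND_POWER_OF_TWO(acc >> 6, 2);
    /* Reference: a single rounding by 6 + 2 = 8 bits also gives  -> 0 */
    int ref = ROUND_POWER_OF_TWO(acc, 8);

The shifted path matches the single-rounding reference, while rounding
twice does not.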

BUG=aomedia:1292

Change-Id: Ib24ce687e628b05d645fbde5306ee552f7ad876b
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index f8f9a1c..4c42274 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -22,6 +22,17 @@
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
 // be the same as dst, or dst can be different from both sources.
 
+// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d32 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
 void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
                               const int32_t *src0, uint32_t src0_stride,
                               const int32_t *src1, uint32_t src1_stride,
@@ -41,8 +52,10 @@
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
         const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   } else if (subw == 1 && subh == 1) {
@@ -54,8 +67,10 @@
                 mask[(2 * i) * mask_stride + (2 * j + 1)] +
                 mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
             2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   } else if (subw == 1 && subh == 0) {
@@ -63,8 +78,10 @@
       for (j = 0; j < w; ++j) {
         const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                     mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   } else {
@@ -72,8 +89,10 @@
       for (j = 0; j < w; ++j) {
         const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                     mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        dst[i * dst_stride + j] =
+            ((m * src0[i * src0_stride + j] +
+              (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+             AOM_BLEND_A64_ROUND_BITS);
       }
     }
   }
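
As a quick standalone check of the identity cited in the new comment (a
sketch only, not part of the patch; it assumes just the standard
ROUND_POWER_OF_TWO definition from aom_dsp/aom_dsp_common.h):

    #include <assert.h>
    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

    int main(void) {
      /* Exhaustively verify ROUND_POWER_OF_TWO(x >> y, z) ==
       * ROUND_POWER_OF_TWO(x, y + z) for small non-negative x. */
      for (int32_t x = 0; x < (1 << 16); ++x) {
        for (int y = 1; y <= 8; ++y) {
          for (int z = 1; z <= 8; ++z) {
            assert(ROUND_POWER_OF_TWO(x >> y, z) ==
                   ROUND_POWER_OF_TWO(x, y + z));
          }
        }
      }
      return 0;
    }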