[wedge/compound-segment, normative] Reduce multiple rounding
As described in the linked bug report, the masked blend operation
rounds its result twice: once inside the blend itself and again when
the intermediate-precision output is later converted back to pixel
precision. This commit replaces the intermediate round in
aom_blend_a64_d32_mask_c() with a plain right shift, which should be
slightly faster and more accurate.
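For reference, the small standalone program below (not part of this
patch) checks the identity quoted in the new comment block and shows
the one-off bias that rounding in two stages can introduce. The
ROUND_POWER_OF_TWO macro mirrors the one in aom_dsp/aom_dsp_common.h;
the 6-bit intermediate round followed by a 6-bit final round is an
assumption made only for this illustration. A companion sketch after
the diff shows what the repeated per-pixel change amounts to.

  /* Standalone sketch: verifies
   *   ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y + z)
   * and shows that rounding twice instead can be off by one. */
  #include <stdio.h>

  #define ROUND_POWER_OF_TWO(value, n) \
    (((value) + ((1 << (n)) >> 1)) >> (n))

  int main(void) {
    const int y = 6;  /* intermediate (blend) round, as in this patch */
    const int z = 6;  /* later round back down to pixel precision */
    int double_round_off = 0, shift_then_round_off = 0;
    for (int x = 0; x < (1 << 20); ++x) {
      const int ref = ROUND_POWER_OF_TWO(x, y + z);  /* round once */
      const int old_path =
          ROUND_POWER_OF_TWO(ROUND_POWER_OF_TWO(x, y), z);
      const int new_path = ROUND_POWER_OF_TWO(x >> y, z);
      if (old_path != ref) ++double_round_off;      /* biased for some x */
      if (new_path != ref) ++shift_then_round_off;  /* stays zero */
    }
    printf("double-round mismatches: %d, shift-then-round: %d\n",
           double_round_off, shift_then_round_off);
    return 0;
  }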
BUG=aomedia:1292
Change-Id: Ib24ce687e628b05d645fbde5306ee552f7ad876b
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index f8f9a1c..4c42274 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -22,6 +22,17 @@
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
// be the same as dst, or dst can be different from both sources.
+// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d32 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
const int32_t *src0, uint32_t src0_stride,
const int32_t *src1, uint32_t src1_stride,
@@ -41,8 +52,10 @@
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = mask[i * mask_stride + j];
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
+ dst[i * dst_stride + j] =
+ ((m * src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
}
}
} else if (subw == 1 && subh == 1) {
@@ -54,8 +67,10 @@
mask[(2 * i) * mask_stride + (2 * j + 1)] +
mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
2);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
+ dst[i * dst_stride + j] =
+ ((m * src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
}
}
} else if (subw == 1 && subh == 0) {
@@ -63,8 +78,10 @@
for (j = 0; j < w; ++j) {
const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
mask[i * mask_stride + (2 * j + 1)]);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
+ dst[i * dst_stride + j] =
+ ((m * src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
}
}
} else {
@@ -72,8 +89,10 @@
for (j = 0; j < w; ++j) {
const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
mask[(2 * i + 1) * mask_stride + j]);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
+ dst[i * dst_stride + j] =
+ ((m * src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
}
}
}
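For reference (not part of the patch): assuming the definitions in
aom_dsp/blend.h, where AOM_BLEND_A64 weights the two sources by m and
AOM_BLEND_A64_MAX_ALPHA - m and then rounds by AOM_BLEND_A64_ROUND_BITS
bits, each hunk above drops only that per-pixel rounding offset; the
weighted sum itself is unchanged. A minimal sketch with illustrative
sample values:

  #include <stdio.h>

  /* Assumed to match aom_dsp/blend.h. */
  #define AOM_BLEND_A64_ROUND_BITS 6
  #define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)

  int main(void) {
    const int m = 37;          /* mask value in [0, AOM_BLEND_A64_MAX_ALPHA] */
    const int src0_px = 5000;  /* intermediate-precision (d32) samples */
    const int src1_px = 4001;
    const int sum =
        m * src0_px + (AOM_BLEND_A64_MAX_ALPHA - m) * src1_px;

    /* Old per-pixel expression, AOM_BLEND_A64(): round to nearest here. */
    const int before = (sum + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
                       AOM_BLEND_A64_ROUND_BITS;
    /* New per-pixel expression: plain shift; the one round to nearest
     * happens later, when the d32 result drops back to pixel precision. */
    const int after = sum >> AOM_BLEND_A64_ROUND_BITS;

    printf("sum=%d before=%d after=%d\n", sum, before, after);
    return 0;
  }

With these sample values the two expressions differ by one; as the
earlier sketch shows, deferring that rounding to the single final round
matches rounding the full-precision value directly, while rounding
twice can be off by one.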