Optimize aom_highbd_blend_a64_d16_mask_c() Branch removal inside inner loops by changing local variable value for different bit-depths. Change-Id: I64ed51c76f17f79bf49c1d094479afa419f1eb8e
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h index ba8e04d..309d0c3 100644 --- a/aom_dsp/aom_dsp_common.h +++ b/aom_dsp/aom_dsp_common.h
@@ -86,6 +86,14 @@ } } +// The result of this branchless code is equivalent to (value < 0 ? 0 : value) +// or max(0, value) and might be faster in some cases. +// Care should be taken since the behavior of right shifting signed type +// negative value is undefined by C standards and implementation defined, +static INLINE unsigned int negative_to_zero(int value) { + return value & ~(value >> (sizeof(value) * 8 - 1)); +} + #ifdef __cplusplus } // extern "C" #endif
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c index 99b0d20..c705221 100644 --- a/aom_dsp/blend_a64_mask.c +++ b/aom_dsp/blend_a64_mask.c
@@ -127,7 +127,6 @@ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int subh, int subw, ConvolveParams *conv_params, const int bd) { - int i, j; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); @@ -143,68 +142,82 @@ assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); + // excerpt from clip_pixel_highbd() + // set saturation_value to (1 << bd) - 1 + unsigned int saturation_value; + switch (bd) { + case 8: + default: saturation_value = 255; break; + case 10: saturation_value = 1023; break; + case 12: saturation_value = 4095; break; + } + if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { int32_t res; - const int m = mask[i * mask_stride + j]; - res = ((m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> + const int m = mask[j]; + res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> AOM_BLEND_A64_ROUND_BITS); res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(res, round_bits), bd); + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; } } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { int32_t res; const int m = ROUND_POWER_OF_TWO( - mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + + mask[mask_stride + 2 * j + 1], 2); - res = ((m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(res, round_bits), bd); + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; } } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { int32_t res; - const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], - mask[i * mask_stride + (2 * j + 1)]); - res = ((m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); + const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(res, round_bits), bd); + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; } } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { int32_t res; - const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], - mask[(2 * i + 1) * mask_stride + j]); - res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); + const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(res, round_bits), bd); + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; } } }