Add aom_comp_mask_pred_ssse3 and aom_comp_mask_upsampled_pred_ssse3
1) For encoder speed, overall ~1% faster with no impact on coding performance.
2) aom_comp_mask_pred_ssse3 is 3.5x - 6x faster than aom_comp_mask_pred_c.
3) aom_comp_mask_upsampled_pred_ssse3 is 1.5x - 3x faster than
   aom_comp_mask_upsampled_pred_c; for the special case where subpel_x ==
   subpel_y == 0, the optimized version achieves a 4x - 7x speedup.
Unit tests for both functions have been added.
Change-Id: Ib498317975e0dbd9cdcf61be327b640dfac9a7e5
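For context on what these kernels compute: each output pixel is a per-pixel
alpha blend of the two predictors, with mask weights in [0, 64] and
round-to-nearest on the final shift. The scalar sketch below is illustrative
only; the function name and argument layout are placeholders rather than the
library's exact signature, and aom_comp_mask_pred_c remains the real
reference implementation.

  #include <stdint.h>

  // Illustrative sketch of the scalar comp-mask blend that the SSSE3 kernel
  // vectorizes (assumption: a 64-weight blend with round-to-nearest; see
  // aom_comp_mask_pred_c for the actual reference code).
  static void comp_mask_blend_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                     int width, int height, const uint8_t *ref,
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        // Weight m in [0, 64]; invert_mask swaps which predictor gets m.
        const int m = invert_mask ? 64 - mask[j] : mask[j];
        comp_pred[j] = (uint8_t)((m * ref[j] + (64 - m) * pred[j] + 32) >> 6);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
      mask += mask_stride;
    }
  }

The SIMD speedups reported above come from doing this multiply/add/shift on
many pixels per instruction instead of one at a time.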
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index df92cfe..db5dfff 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -232,8 +232,10 @@
static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
COMPOUND_TYPE comp_type;
+ int i;
if (!is_comp_ref_allowed(sb_type)) return 0;
- for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) {
+ for (i = 0; i < COMPOUND_TYPES; i++) {
+ comp_type = (COMPOUND_TYPE)i;
if (is_masked_compound_type(comp_type) &&
is_interinter_compound_used(comp_type, sb_type))
return 1;
@@ -286,8 +288,8 @@
const int spel_right = spel_left - SUBPEL_SHIFTS;
const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
const int spel_bottom = spel_top - SUBPEL_SHIFTS;
- MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
- src_mv->col * (1 << (1 - ss_x)) };
+ MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+ (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
assert(ss_x <= 1);
assert(ss_y <= 1);