Add aom_comp_mask_<upsampled>pred_ssse3

1) Overall encoder speed improves by ~1% with no impact on coding performance.
2) aom_comp_mask_pred_ssse3 is 3.5x - 6x faster than aom_comp_mask_pred_c.
3) aom_comp_mask_upsampled_pred_ssse3 is 1.5x - 3x faster than
aom_comp_mask_upsampled_pred_c; for the special case where
subpel_x == subpel_y == 0, the optimized version achieves a 4x - 7x speedup.

Unit tests for both functions have been added.
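
For reference, the per-pixel operation these kernels accelerate is a 6-bit
weighted blend of the two predictors. The scalar sketch below assumes the
usual AOM_BLEND_A64 semantics (mask values in [0, 64], rounding by 32 before
the shift); the function name, exact signature, and invert_mask handling are
illustrative assumptions, not the library's declaration.

  #include <stdint.h>

  #define MASK_MAX 64 /* assumed to match AOM_BLEND_A64_MAX_ALPHA */

  /* Scalar sketch of the masked compound prediction blend. */
  static void comp_mask_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        /* Weight toward ref or pred depending on invert_mask (assumption). */
        const int m = invert_mask ? MASK_MAX - mask[j] : mask[j];
        /* Rounded 6-bit blend: (m * ref + (64 - m) * pred + 32) >> 6. */
        comp_pred[j] =
            (uint8_t)((m * ref[j] + (MASK_MAX - m) * pred[j] + 32) >> 6);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
      mask += mask_stride;
    }
  }

The SSSE3 versions vectorize this blend; the larger 4x - 7x gain in the
upsampled variant for subpel_x == subpel_y == 0 presumably comes from being
able to skip the upsampling interpolation entirely in that case.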

Change-Id: Ib498317975e0dbd9cdcf61be327b640dfac9a7e5
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index df92cfe..db5dfff 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -232,8 +232,10 @@
 
 static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
   COMPOUND_TYPE comp_type;
+  int i;
   if (!is_comp_ref_allowed(sb_type)) return 0;
-  for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) {
+  for (i = 0; i < COMPOUND_TYPES; i++) {
+    comp_type = (COMPOUND_TYPE)i;
     if (is_masked_compound_type(comp_type) &&
         is_interinter_compound_used(comp_type, sb_type))
       return 1;
@@ -286,8 +288,8 @@
   const int spel_right = spel_left - SUBPEL_SHIFTS;
   const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
   const int spel_bottom = spel_top - SUBPEL_SHIFTS;
-  MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
-                    src_mv->col * (1 << (1 - ss_x)) };
+  MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+                    (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
   assert(ss_x <= 1);
   assert(ss_y <= 1);