Reduce the bandwidth requirement of sub8x8 chroma obmc

HW does not support < 4x4 prediction. To limit the bandwidth
requirement, for small blocks, only blend with neighbors from one
side. If block-size of current plane is 4x4 or 8x4, the above
neighbor (dir = 0) will be skipped. If it is 4x8, the left neighbor
(dir = 1) will be skipped.
This change keeps the bandwidth requirement of OBMC no higher than
that of the normal compound inter modes.

Loss of gain (PSNR-Y/PSNR-Cb/PSNR-Cr/CIEDE2000)
AWCY HL: 0.05/0.09/0.14/0.04
AWCY LL: 0.06/0.07/0.10/0.12

Change-Id: I3854afc69c3014da99bde4b19bb726e4c077d59e
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 1b6cdc0..71c8ebd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -1686,6 +1686,27 @@
   }
 }
 
+// Returns 1 if the OBMC blend with the neighbor on side dir (0 = above,
+// 1 = left) must be skipped for this plane, 0 otherwise. HW does not support
+// < 4x4 prediction, so to limit bandwidth small plane blocks blend with one
+// side only: 4x4 and 8x4 skip the above neighbor, 4x8 skips the left neighbor.
+int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd,
+                           int dir) {
+  assert(is_motion_variation_allowed_bsize(bsize));
+
+  BLOCK_SIZE bsize_plane =
+      ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+#if CONFIG_CB4X4
+  if (bsize_plane < BLOCK_4X4) return 1;  // below BLOCK_4X4: skip both sides
+#endif
+  switch (bsize_plane) {
+    case BLOCK_4X4:
+    case BLOCK_8X4: return dir == 0; break;  // skip only the above neighbor
+    case BLOCK_4X8: return dir == 1; break;  // skip only the left neighbor
+    default: return 0;  // any other size: blend with both sides
+  }
+}
+
 // This function combines motion compensated predictions that is generated by
 // top/left neighboring blocks' inter predictors with the regular inter
 // prediction. We assume the original prediction (bmc) is stored in
@@ -1727,6 +1748,9 @@
           const struct macroblockd_plane *pd = &xd->plane[plane];
           const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
           const int bh = overlap >> pd->subsampling_y;
+
+          if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
           const int dst_stride = pd->dst.stride;
           uint8_t *const dst = &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
           const int tmp_stride = above_stride[plane];
@@ -1773,6 +1797,9 @@
           const struct macroblockd_plane *pd = &xd->plane[plane];
           const int bw = overlap >> pd->subsampling_x;
           const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
+
+          if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
           const int dst_stride = pd->dst.stride;
           uint8_t *const dst =
               &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
@@ -1876,6 +1903,8 @@
       bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
                   4);
 
+      if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
 #if CONFIG_WARPED_MOTION
       if (above_mbmi->motion_mode == WARPED_CAUSAL &&
           WARP_NEIGHBORS_WITH_OBMC) {
@@ -1981,6 +2010,8 @@
                   4);
       bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
 
+      if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
 #if CONFIG_WARPED_MOTION
       if (left_mbmi->motion_mode == WARPED_CAUSAL && WARP_NEIGHBORS_WITH_OBMC) {
         assert_motion_mode_valid(WARPED_CAUSAL,