AV1 RT: Implement VERT and HORZ partitions in variance-based partitioning

Gives a 0.02 BD-rate improvement and a ~1-2% speedup.

Change-Id: Id876f904fdc086f72dc537845ce772d5e9c533fa
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 4ac4b4d..f2e961b 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2008,10 +2008,39 @@
       encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
                &pc_tree->none, NULL);
       break;
-    // TODO(kyslov@) Add HORZ and VERT partitions
-    case PARTITION_HORZ:
     case PARTITION_VERT:
-      assert(0 && "Cannot yet handle non-square partition types");
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+                    sf->use_fast_nonrd_pick_mode ? PICK_MODE_FAST_NONRD
+                                                 : PICK_MODE_NONRD);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+               PARTITION_VERT, &pc_tree->vertical[0], NULL);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost,
+                      PARTITION_VERT, subsize, &pc_tree->vertical[1], INT64_MAX,
+                      sf->use_fast_nonrd_pick_mode ? PICK_MODE_FAST_NONRD
+                                                   : PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+                 PARTITION_VERT, &pc_tree->vertical[1], NULL);
+      }
+      break;
+    case PARTITION_HORZ:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+                    sf->use_fast_nonrd_pick_mode ? PICK_MODE_FAST_NONRD
+                                                 : PICK_MODE_NONRD);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+               PARTITION_HORZ, &pc_tree->horizontal[0], NULL);
+
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost,
+                      PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                      INT64_MAX,
+                      sf->use_fast_nonrd_pick_mode ? PICK_MODE_FAST_NONRD
+                                                   : PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+                 PARTITION_HORZ, &pc_tree->horizontal[1], NULL);
+      }
       break;
     case PARTITION_SPLIT:
       for (int i = 0; i < 4; i++) {
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 437e9fd..a5772dc 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -194,18 +194,14 @@
 
   if (force_split == 1) return 0;
 
-  if (mi_col + block_width > tile->mi_col_end ||
-      mi_row + block_height > tile->mi_row_end)
-    return 0;
-
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
     if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height <= tile->mi_row_end &&
         vt.part_variances->none.variance < threshold) {
       set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
       return 1;
@@ -221,12 +217,42 @@
       return 0;
     }
     // If variance is low, take the bsize (no split).
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height <= tile->mi_row_end &&
         vt.part_variances->none.variance < threshold) {
       set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
       return 1;
     }
+    // Check vertical split.
+    if (mi_row + block_height <= tile->mi_row_end &&
+        mi_col + block_width / 2 <= tile->mi_col_end) {
+      BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+      get_variance(&vt.part_variances->vert[0]);
+      get_variance(&vt.part_variances->vert[1]);
+      if (vt.part_variances->vert[0].variance < threshold &&
+          vt.part_variances->vert[1].variance < threshold &&
+          get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+                               xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+        return 1;
+      }
+    }
+    // Check horizontal split.
+    if (mi_col + block_width <= tile->mi_col_end &&
+        mi_row + block_height / 2 <= tile->mi_row_end) {
+      BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+      get_variance(&vt.part_variances->horz[0]);
+      get_variance(&vt.part_variances->horz[1]);
+      if (vt.part_variances->horz[0].variance < threshold &&
+          vt.part_variances->horz[1].variance < threshold &&
+          get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+                               xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+        return 1;
+      }
+    }
     return 0;
   }
   return 0;
@@ -687,7 +713,8 @@
       force_split[0] = 1;
   }
 
-  if (!set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+  if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end ||
+      !set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
                            thresholds[0], BLOCK_16X16, force_split[0])) {
     for (m = 0; m < num_64x64_blocks; ++m) {
       const int x64_idx = ((m & 1) << 4);