RT: fix variance low flag for mode skipping.

Fix a bug where vertical and horizontal partitions were set to the same flag.

The flag is used to skip the golden reference frame.
For now it is only used at 720p and above with speed 8.

2% speed up and 0.17% quality loss on 720p.

Change-Id: I6c5862d54ec8521a8d9e3cf9aafbcd3b766f6263
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index a8cc8ff..89fd4d3 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -443,6 +443,14 @@
   float log_q;
 #endif
   int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+  // 0 - 128x128
+  // 1-2 - 128x64
+  // 3-4 - 64x128
+  // 5-8 - 64x64
+  // 9-16 - 64x32
+  // 17-24 - 32x64
+  // 25-40 - 32x32
+  // 41-104 - 16x16
   uint8_t variance_low[105];
   uint8_t content_state_sb;
   // Strong color activity detection. Used in REALTIME coding mode to enhance
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 9a1fc14..28002b9 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1170,51 +1170,70 @@
                                               int mi_col, BLOCK_SIZE bsize) {
   int force_skip_low_temp_var = 0;
   int x, y;
+  x = (mi_col & 0x1F) >> 4;
+  // y = (mi_row & 0x1F) >> 4;
+  // const int idx64 = (y << 1) + x;
+  y = (mi_row & 0x17) >> 3;
+  const int idx64 = y + x;
+
+  x = (mi_col & 0xF) >> 3;
+  // y = (mi_row & 0xF) >> 3;
+  // const int idx32 = (y << 1) + x;
+  y = (mi_row & 0xB) >> 2;
+  const int idx32 = y + x;
+
+  x = (mi_col & 0x7) >> 2;
+  // y = (mi_row & 0x7) >> 2;
+  // const int idx16 = (y << 1) + x;
+  y = (mi_row & 0x5) >> 1;
+  const int idx16 = y + x;
   // Set force_skip_low_temp_var based on the block size and block offset.
   switch (bsize) {
     case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_128X64:
+      assert((mi_col & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
+      break;
+    case BLOCK_64X128:
+      assert((mi_row & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
+      break;
     case BLOCK_64X64:
+      // Location of this 64x64 block inside the 128x128 superblock
+      force_skip_low_temp_var = variance_low[5 + idx64];
+      break;
+    case BLOCK_64X32:
+      x = (mi_col & 0x1F) >> 4;
+      y = (mi_row & 0x1F) >> 3;
+      /*
+      .---------------.---------------.
+      | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+      :---------------+---------------:
+      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+      :---------------+---------------:
+      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+      :---------------+---------------:
+      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+      '---------------'---------------'
+      */
+      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+      force_skip_low_temp_var = variance_low[9 + idx64x32];
+      break;
+    case BLOCK_32X64:
+      x = (mi_col & 0x1F) >> 3;
+      y = (mi_row & 0x1F) >> 4;
+      const int idx32x64 = (y << 2) + x;
+      force_skip_low_temp_var = variance_low[17 + idx32x64];
+      break;
     case BLOCK_32X32:
+      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
     case BLOCK_16X16:
-      x = mi_col % 32;
-      y = mi_row % 32;
-      if (bsize == BLOCK_64X64) {
-        assert((x == 0 || x == 16) && (y == 0 || y == 16));
-      }
-      x >>= 4;
-      y >>= 4;
-      const int idx64 = y * 2 + x;
-      if (bsize == BLOCK_64X64) {
-        force_skip_low_temp_var = variance_low[1 + idx64];
-        break;
-      }
-
-      x = mi_col % 16;
-      y = mi_row % 16;
-      if (bsize == BLOCK_32X32) {
-        assert((x == 0 || x == 8) && (y == 0 || y == 8));
-      }
-      x >>= 3;
-      y >>= 3;
-      const int idx32 = y * 2 + x;
-      if (bsize == BLOCK_32X32) {
-        force_skip_low_temp_var = variance_low[5 + (idx64 << 2) + idx32];
-        break;
-      }
-
-      x = mi_col % 8;
-      y = mi_row % 8;
-      if (bsize == BLOCK_16X16) {
-        assert((x == 0 || x == 4) && (y == 0 || y == 4));
-      }
-      x >>= 2;
-      y >>= 2;
-      const int idx16 = y * 2 + x;
-      if (bsize == BLOCK_16X16) {
-        force_skip_low_temp_var =
-            variance_low[21 + (idx64 << 4) + (idx32 << 2) + idx16];
-        break;
-      }
+      force_skip_low_temp_var =
+          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+      break;
     default: break;
   }
   return force_skip_low_temp_var;
@@ -1542,15 +1561,13 @@
 
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
-  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  if (!is_small_sb && cpi->sf.rt_sf.short_circuit_low_temp_var &&
       x->nonrd_prune_ref_frame_search) {
     force_skip_low_temp_var =
         get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
-    // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3,
-    // skip golden reference.
-    if ((cpi->sf.rt_sf.short_circuit_low_temp_var == 1 ||
-         cpi->sf.rt_sf.short_circuit_low_temp_var == 3) &&
-        force_skip_low_temp_var) {
+    // If force_skip_low_temp_var is set, skip golden reference.
+    if (force_skip_low_temp_var) {
       usable_ref_frame = LAST_FRAME;
     }
   }
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index c74526e..2202d37 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -468,51 +468,61 @@
         xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
         xd->mi[0]->mv[0].as_mv.row < mv_thr &&
         xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
-    if (xd->mi[0]->sb_type == BLOCK_128X128 ||
-        xd->mi[0]->sb_type == BLOCK_64X128 ||
-        xd->mi[0]->sb_type == BLOCK_128X64) {
-      if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+    if (xd->mi[0]->sb_type == BLOCK_128X128) {
+      if (vt->part_variances.none.variance < (thresholds[0] >> 1))
         x->variance_low[0] = 1;
+    } else if (xd->mi[0]->sb_type == BLOCK_128X64) {
+      for (i = 0; i < 2; i++) {
+        if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+          x->variance_low[i + 1] = 1;
+      }
+    } else if (xd->mi[0]->sb_type == BLOCK_64X128) {
+      for (i = 0; i < 2; i++) {
+        if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+          x->variance_low[i + 3] = 1;
+      }
     } else {
       for (i = 0; i < 4; i++) {
         const int idx[4][2] = { { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } };
         const int idx_str =
             cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
         MB_MODE_INFO **mi_64 = cm->mi_grid_base + idx_str;
-
+        if (*mi_64 == NULL) continue;
         if (cm->mi_cols <= mi_col + idx[i][1] ||
             cm->mi_rows <= mi_row + idx[i][0])
           continue;
-
-        if ((*mi_64)->sb_type == BLOCK_64X64 ||
-            (*mi_64)->sb_type == BLOCK_64X32 ||
-            (*mi_64)->sb_type == BLOCK_32X64) {
-          int64_t threshold_64x64 =
-              (cpi->sf.rt_sf.short_circuit_low_temp_var == 1 ||
-               cpi->sf.rt_sf.short_circuit_low_temp_var == 3)
-                  ? ((5 * thresholds[1]) >> 3)
-                  : (thresholds[1] >> 1);
+        const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
+        if ((*mi_64)->sb_type == BLOCK_64X64) {
           if (vt->split[i].part_variances.none.variance < threshold_64x64)
-            x->variance_low[1 + i] = 1;
+            x->variance_low[5 + i] = 1;
+        } else if ((*mi_64)->sb_type == BLOCK_64X32) {
+          for (j = 0; j < 2; j++)
+            if (vt->split[i].part_variances.horz[j].variance <
+                (threshold_64x64 >> 1))
+              x->variance_low[9 + (i << 1) + j] = 1;
+        } else if ((*mi_64)->sb_type == BLOCK_32X64) {
+          for (j = 0; j < 2; j++)
+            if (vt->split[i].part_variances.vert[j].variance <
+                (threshold_64x64 >> 1))
+              x->variance_low[17 + (i << 1) + j] = 1;
         } else {
           for (k = 0; k < 4; k++) {
             const int idx1[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
             const int idx_str1 = cm->mi_stride * idx1[k][0] + idx1[k][1];
             MB_MODE_INFO **mi_32 = cm->mi_grid_base + idx_str + idx_str1;
+            if (*mi_32 == NULL) continue;
 
             if (cm->mi_cols <= mi_col + idx[i][1] + idx1[k][1] ||
                 cm->mi_rows <= mi_row + idx[i][0] + idx1[k][0])
               continue;
+            const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
             if ((*mi_32)->sb_type == BLOCK_32X32) {
-              int64_t threshold_32x32 =
-                  (cpi->sf.rt_sf.short_circuit_low_temp_var == 1 ||
-                   cpi->sf.rt_sf.short_circuit_low_temp_var == 3)
-                      ? ((5 * thresholds[2]) >> 3)
-                      : (thresholds[2] >> 1);
               if (vt->split[i].split[k].part_variances.none.variance <
                   threshold_32x32)
-                x->variance_low[5 + (i << 2) + k] = 1;
-            } else if (cpi->sf.rt_sf.short_circuit_low_temp_var >= 2) {
+                x->variance_low[25 + (i << 2) + k] = 1;
+            } else {
+              // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+              // inside.
               if ((*mi_32)->sb_type == BLOCK_16X16 ||
                   (*mi_32)->sb_type == BLOCK_32X16 ||
                   (*mi_32)->sb_type == BLOCK_16X32) {
@@ -521,7 +531,7 @@
                           .split[k]
                           .split[j]
                           .part_variances.none.variance < (thresholds[3] >> 8))
-                    x->variance_low[21 + (i << 4) + (k << 2) + j] = 1;
+                    x->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
                 }
               }
             }