RT: Handle variance low flag properly for superblock 64x64

Neutral bdrate loss on rtc_derf.
2-3% speed gain on low-res speed 8 50K on Linux, 1-2% speedup on ARM.

Change-Id: Ic9bbc1c07a8dd44cd8bd1315cb40b603088b1cec
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index bc59f96..e807fd0 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -58,6 +58,10 @@
   PREDICTION_MODE pred_mode;
 } REF_MODE;
 
+static const int pos_shift_16x16[4][4] = {
+  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
 #define RT_INTER_MODES 9
 static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
   { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
@@ -1166,6 +1170,55 @@
   }
 }
 
+static INLINE int get_force_skip_low_temp_var_small_sb(uint8_t *variance_low,
+                                                       int mi_row, int mi_col,
+                                                       BLOCK_SIZE bsize) {
+  // Offsets of the block, in mi units, within the 64x64 superblock.
+  const int mi_x = mi_row & 0xF;
+  const int mi_y = mi_col & 0xF;
+  // Row (i) and column (j) indices of the 16x16 block within the superblock.
+  const int i = mi_x >> 2;
+  const int j = mi_y >> 2;
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_64X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[1];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[2];
+      }
+      break;
+    case BLOCK_32X64:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[3];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[4];
+      }
+      break;
+    case BLOCK_32X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[5];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[6];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[7];
+      } else if (mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[8];
+      }
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+      break;
+    default: break;
+  }
+
+  return force_skip_low_temp_var;
+}
+
 static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
                                               int mi_col, BLOCK_SIZE bsize) {
   int force_skip_low_temp_var = 0;
@@ -1562,11 +1615,15 @@
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
   const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
-  if (!is_small_sb && cpi->sf.rt_sf.short_circuit_low_temp_var &&
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
       x->nonrd_prune_ref_frame_search) {
-    force_skip_low_temp_var =
-        get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
-    // If force_skip_low_temp_var is set, skip non-LAST references.
+    if (is_small_sb)
+      force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    else
+      force_skip_low_temp_var = get_force_skip_low_temp_var(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip all non-LAST references.
     if (force_skip_low_temp_var) {
       usable_ref_frame = LAST_FRAME;
     }
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index dddc21f..90e9413 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -450,11 +450,145 @@
   }
 }
 
+// Set temporal variance low flag for superblock 64x64.
+// Only the first 25 entries of the array are used in this case.
+static AOM_INLINE void set_low_temp_var_flag_64x64(AV1_COMP *cpi, MACROBLOCK *x,
+                                                   MACROBLOCKD *xd, v64x64 *vt,
+                                                   int64_t thresholds[],
+                                                   int mi_col, int mi_row) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (xd->mi[0]->sb_type == BLOCK_64X64) {
+    if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+      x->variance_low[0] = 1;
+  } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 1] = 1;
+    }
+  } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 3] = 1;
+    }
+  } else {
+    static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+    for (int i = 0; i < 4; i++) {
+      const int idx_str =
+          cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
+      MB_MODE_INFO **this_mi = cm->mi_grid_base + idx_str;
+
+      if (cm->mi_cols <= mi_col + idx[i][1] ||
+          cm->mi_rows <= mi_row + idx[i][0])
+        continue;
+
+      if (*this_mi == NULL) continue;
+
+      if ((*this_mi)->sb_type == BLOCK_32X32) {
+        int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
+        if (vt->split[i].part_variances.none.variance < threshold_32x32)
+          x->variance_low[i + 5] = 1;
+      } else {
+        // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+        // inside.
+        if ((*this_mi)->sb_type == BLOCK_16X16 ||
+            (*this_mi)->sb_type == BLOCK_32X16 ||
+            (*this_mi)->sb_type == BLOCK_16X32) {
+          for (int j = 0; j < 4; j++) {
+            if (vt->split[i].split[j].part_variances.none.variance <
+                (thresholds[2] >> 8))
+              x->variance_low[(i << 2) + j + 9] = 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+static AOM_INLINE void set_low_temp_var_flag_128x128(
+    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, v128x128 *vt,
+    int64_t thresholds[], int mi_col, int mi_row) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (xd->mi[0]->sb_type == BLOCK_128X128) {
+    if (vt->part_variances.none.variance < (thresholds[0] >> 1))
+      x->variance_low[0] = 1;
+  } else if (xd->mi[0]->sb_type == BLOCK_128X64) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 1] = 1;
+    }
+  } else if (xd->mi[0]->sb_type == BLOCK_64X128) {
+    for (int i = 0; i < 2; i++) {
+      if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+        x->variance_low[i + 3] = 1;
+    }
+  } else {
+    static const int idx64[4][2] = {
+      { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 }
+    };
+    static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+    for (int i = 0; i < 4; i++) {
+      const int idx_str =
+          cm->mi_stride * (mi_row + idx64[i][0]) + mi_col + idx64[i][1];
+      MB_MODE_INFO **mi_64 = cm->mi_grid_base + idx_str;
+      if (*mi_64 == NULL) continue;
+      if (cm->mi_cols <= mi_col + idx64[i][1] ||
+          cm->mi_rows <= mi_row + idx64[i][0])
+        continue;
+      const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
+      if ((*mi_64)->sb_type == BLOCK_64X64) {
+        if (vt->split[i].part_variances.none.variance < threshold_64x64)
+          x->variance_low[5 + i] = 1;
+      } else if ((*mi_64)->sb_type == BLOCK_64X32) {
+        for (int j = 0; j < 2; j++)
+          if (vt->split[i].part_variances.horz[j].variance <
+              (threshold_64x64 >> 1))
+            x->variance_low[9 + (i << 1) + j] = 1;
+      } else if ((*mi_64)->sb_type == BLOCK_32X64) {
+        for (int j = 0; j < 2; j++)
+          if (vt->split[i].part_variances.vert[j].variance <
+              (threshold_64x64 >> 1))
+            x->variance_low[17 + (i << 1) + j] = 1;
+      } else {
+        for (int k = 0; k < 4; k++) {
+          const int idx_str1 = cm->mi_stride * idx32[k][0] + idx32[k][1];
+          MB_MODE_INFO **mi_32 = cm->mi_grid_base + idx_str + idx_str1;
+          if (*mi_32 == NULL) continue;
+
+          if (cm->mi_cols <= mi_col + idx64[i][1] + idx32[k][1] ||
+              cm->mi_rows <= mi_row + idx64[i][0] + idx32[k][0])
+            continue;
+          const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
+          if ((*mi_32)->sb_type == BLOCK_32X32) {
+            if (vt->split[i].split[k].part_variances.none.variance <
+                threshold_32x32)
+              x->variance_low[25 + (i << 2) + k] = 1;
+          } else {
+            // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+            // inside.
+            if ((*mi_32)->sb_type == BLOCK_16X16 ||
+                (*mi_32)->sb_type == BLOCK_32X16 ||
+                (*mi_32)->sb_type == BLOCK_16X32) {
+              for (int j = 0; j < 4; j++) {
+                if (vt->split[i]
+                        .split[k]
+                        .split[j]
+                        .part_variances.none.variance < (thresholds[3] >> 8))
+                  x->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 static AOM_INLINE void set_low_temp_var_flag(
     AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, v128x128 *vt,
     int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col,
     int mi_row) {
-  int i, j, k;
   AV1_COMMON *const cm = &cpi->common;
   const int mv_thr = cm->width > 640 ? 8 : 4;
   // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
@@ -468,77 +602,12 @@
         xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
         xd->mi[0]->mv[0].as_mv.row < mv_thr &&
         xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
-    if (xd->mi[0]->sb_type == BLOCK_128X128) {
-      if (vt->part_variances.none.variance < (thresholds[0] >> 1))
-        x->variance_low[0] = 1;
-    } else if (xd->mi[0]->sb_type == BLOCK_128X64) {
-      for (i = 0; i < 2; i++) {
-        if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
-          x->variance_low[i + 1] = 1;
-      }
-    } else if (xd->mi[0]->sb_type == BLOCK_64X128) {
-      for (i = 0; i < 2; i++) {
-        if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
-          x->variance_low[i + 3] = 1;
-      }
-    } else {
-      for (i = 0; i < 4; i++) {
-        const int idx[4][2] = { { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } };
-        const int idx_str =
-            cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
-        MB_MODE_INFO **mi_64 = cm->mi_grid_base + idx_str;
-        if (*mi_64 == NULL) continue;
-        if (cm->mi_cols <= mi_col + idx[i][1] ||
-            cm->mi_rows <= mi_row + idx[i][0])
-          continue;
-        const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
-        if ((*mi_64)->sb_type == BLOCK_64X64) {
-          if (vt->split[i].part_variances.none.variance < threshold_64x64)
-            x->variance_low[5 + i] = 1;
-        } else if ((*mi_64)->sb_type == BLOCK_64X32) {
-          for (j = 0; j < 2; j++)
-            if (vt->split[i].part_variances.horz[j].variance <
-                (threshold_64x64 >> 1))
-              x->variance_low[9 + (i << 1) + j] = 1;
-        } else if ((*mi_64)->sb_type == BLOCK_32X64) {
-          for (j = 0; j < 2; j++)
-            if (vt->split[i].part_variances.vert[j].variance <
-                (threshold_64x64 >> 1))
-              x->variance_low[17 + (i << 1) + j] = 1;
-        } else {
-          for (k = 0; k < 4; k++) {
-            const int idx1[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
-            const int idx_str1 = cm->mi_stride * idx1[k][0] + idx1[k][1];
-            MB_MODE_INFO **mi_32 = cm->mi_grid_base + idx_str + idx_str1;
-            if (*mi_32 == NULL) continue;
-
-            if (cm->mi_cols <= mi_col + idx[i][1] + idx1[k][1] ||
-                cm->mi_rows <= mi_row + idx[i][0] + idx1[k][0])
-              continue;
-            const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
-            if ((*mi_32)->sb_type == BLOCK_32X32) {
-              if (vt->split[i].split[k].part_variances.none.variance <
-                  threshold_32x32)
-                x->variance_low[25 + (i << 2) + k] = 1;
-            } else {
-              // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
-              // inside.
-              if ((*mi_32)->sb_type == BLOCK_16X16 ||
-                  (*mi_32)->sb_type == BLOCK_32X16 ||
-                  (*mi_32)->sb_type == BLOCK_16X32) {
-                for (j = 0; j < 4; j++) {
-                  if (vt->split[i]
-                          .split[k]
-                          .split[j]
-                          .part_variances.none.variance < (thresholds[3] >> 8))
-                    x->variance_low[41 + (i << 4) + (k << 2) + j] = 1;
-                }
-              }
-            }
-          }
-        }
-      }
-    }
+    const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+    if (is_small_sb)
+      set_low_temp_var_flag_64x64(cpi, x, xd, &(vt->split[0]), thresholds,
+                                  mi_col, mi_row);
+    else
+      set_low_temp_var_flag_128x128(cpi, x, xd, vt, thresholds, mi_col, mi_row);
   }
 }
 
@@ -1009,7 +1078,7 @@
     }
   }
 
-  if (cpi->sf.rt_sf.short_circuit_low_temp_var && !is_small_sb) {
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
     set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
                           mi_col, mi_row);
   }