Add coef_interleave experiment

This commit adds an experiment to interleave the coding of transform
coefficients from YUV planes.  The experiment can be enabled at config
time by --enable-coef-interleave.

Change-Id: Ifd92f9c367304bca9732f13fa026eb8996363677
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 8cfd223..9cae785 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -39,6 +39,87 @@
   }
 }
 
+#if CONFIG_COEF_INTERLEAVE
+// Visits the block's transform units (TUs) with planes interleaved: each luma
+// TU (raster order) is followed by the next U and V TU; leftover chroma last.
+void av1_foreach_transformed_block_interleave(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd_y = &xd->plane[0];
+  const struct macroblockd_plane *const pd_c = &xd->plane[1];
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+
+  const TX_SIZE tx_log2_y = mbmi->tx_size;
+  const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c);
+  const int tx_sz_y = (1 << tx_log2_y);
+  const int tx_sz_c = (1 << tx_log2_c);
+
+  const BLOCK_SIZE plane_bsize_y = get_plane_block_size(bsize, pd_y);
+  const BLOCK_SIZE plane_bsize_c = get_plane_block_size(bsize, pd_c);
+
+  const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
+  const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
+  const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
+  const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
+
+  const int step_y = 1 << (tx_log2_y << 1);
+  const int step_c = 1 << (tx_log2_c << 1);
+
+  const int max_4x4_w_y =
+      get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge, pd_y->subsampling_x);
+  const int max_4x4_h_y =
+      get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge, pd_y->subsampling_y);
+  const int max_4x4_w_c =
+      get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge, pd_c->subsampling_x);
+  const int max_4x4_h_c =
+      get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge, pd_c->subsampling_y);
+
+  // Extra block offset added once per completed TU row.
+  const int extra_step_y = ((num_4x4_w_y - max_4x4_w_y) >> tx_log2_y) * step_y;
+  const int extra_step_c = ((num_4x4_w_c - max_4x4_w_c) >> tx_log2_c) * step_c;
+
+  // The max_4x4_w/h may be smaller than tx_sz under some corner cases,
+  // e.g. when the SB is split by tile boundaries.
+  const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+  const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+  const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+  const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+  const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+  int tu_idx_c = 0;
+  int offset_y, row_y, col_y;
+  int offset_c, row_c, col_c;
+
+  for (row_y = 0; row_y < tu_num_h_y; row_y++) {
+    for (col_y = 0; col_y < tu_num_w_y; col_y++) {
+      // luma
+      offset_y = (row_y * tu_num_w_y + col_y) * step_y + row_y * extra_step_y;
+      visit(0, offset_y, row_y * tx_sz_y, col_y * tx_sz_y, plane_bsize_y,
+            tx_log2_y, arg);
+      // chroma
+      if (tu_idx_c < tu_num_c) {
+        row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+        col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+        offset_c = tu_idx_c * step_c + (tu_idx_c / tu_num_w_c) * extra_step_c;
+        visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+        visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+        tu_idx_c++;
+      }
+    }
+  }
+
+  // In the 422 case chroma can have more TUs than luma; visit the remainder.
+  while (tu_idx_c < tu_num_c) {
+    row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+    col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+    offset_c = tu_idx_c * step_c + (tu_idx_c / tu_num_w_c) * extra_step_c;
+    visit(1, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+    visit(2, offset_c, row_c, col_c, plane_bsize_c, tx_log2_c, arg);
+    tu_idx_c++;
+  }
+}
+#endif
+
 void av1_foreach_transformed_block_in_plane(
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg) {
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 1766a7c..7e1a4b9 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -833,6 +833,17 @@
                                    foreach_transformed_block_visitor visit,
                                    void *arg);
 
+#if CONFIG_COEF_INTERLEAVE
+// Returns num_4x4, plus mb_to_edge >> (5 + subsampling) when mb_to_edge < 0.
+static INLINE int get_max_4x4_size(int num_4x4, int mb_to_edge,
+                                   int subsampling) {
+  return num_4x4 + (mb_to_edge >= 0 ? 0 : mb_to_edge >> (5 + subsampling));
+}
+void av1_foreach_transformed_block_interleave(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+    foreach_transformed_block_visitor visit, void *arg);
+#endif
+
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       TX_SIZE tx_size, int has_eob, int aoff, int loff);
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 54f2310..91c2b3a 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1435,6 +1435,127 @@
   if (mbmi->skip) {
     dec_reset_skip_context(xd);
   }
+#if CONFIG_COEF_INTERLEAVE
+  {
+    const struct macroblockd_plane *const pd_y = &xd->plane[0];
+    const struct macroblockd_plane *const pd_c = &xd->plane[1];
+    const TX_SIZE tx_log2_y = mbmi->tx_size;
+    const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c);
+    const int tx_sz_y = (1 << tx_log2_y);
+    const int tx_sz_c = (1 << tx_log2_c);
+    const int num_4x4_w_y = pd_y->n4_w;
+    const int num_4x4_h_y = pd_y->n4_h;
+    const int num_4x4_w_c = pd_c->n4_w;
+    const int num_4x4_h_c = pd_c->n4_h;
+    const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
+                                             pd_y->subsampling_x);
+    const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
+                                             pd_y->subsampling_y);
+    const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
+                                             pd_c->subsampling_x);
+    const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
+                                             pd_c->subsampling_y);
+
+    // The max_4x4_w/h may be smaller than tx_sz under some corner cases,
+    // e.g. when the SB is split by tile boundaries.
+    const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+    const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+    const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+    const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+    const int tu_num_y = tu_num_w_y * tu_num_h_y;
+    const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+    if (!is_inter_block(mbmi)) {
+      int tu_idx_c = 0;
+      int row_y, col_y, row_c, col_c;
+      int plane;
+
+#if CONFIG_PALETTE
+      for (plane = 0; plane <= 1; ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane])
+          av1_decode_palette_tokens(xd, plane, r);
+      }
+#endif
+
+      for (row_y = 0; row_y < tu_num_h_y; row_y++) {
+        for (col_y = 0; col_y < tu_num_w_y; col_y++) {
+          // luma
+          predict_and_reconstruct_intra_block(
+              cm, xd, r, mbmi, 0, row_y * tx_sz_y, col_y * tx_sz_y, tx_log2_y);
+          // chroma
+          if (tu_idx_c < tu_num_c) {
+            row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+            col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+            predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c,
+                                                col_c, tx_log2_c);
+            predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c,
+                                                col_c, tx_log2_c);
+            tu_idx_c++;
+          }
+        }
+      }
+
+      // In 422 case, it's possible that Chroma has more TUs than Luma
+      while (tu_idx_c < tu_num_c) {
+        row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+        col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+        predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 1, row_c, col_c,
+                                            tx_log2_c);
+        predict_and_reconstruct_intra_block(cm, xd, r, mbmi, 2, row_c, col_c,
+                                            tx_log2_c);
+        tu_idx_c++;
+      }
+    } else {
+      // Prediction
+      av1_build_inter_predictors_sb(xd, mi_row, mi_col,
+                                    AOMMAX(bsize, BLOCK_8X8));
+
+      // Reconstruction
+      if (!mbmi->skip) {
+        int eobtotal = 0;
+        int tu_idx_c = 0;
+        int row_y, col_y, row_c, col_c;
+
+        for (row_y = 0; row_y < tu_num_h_y; row_y++) {
+          for (col_y = 0; col_y < tu_num_w_y; col_y++) {
+            // luma
+            eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 0,
+                                                row_y * tx_sz_y,
+                                                col_y * tx_sz_y, tx_log2_y);
+            // chroma
+            if (tu_idx_c < tu_num_c) {
+              row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+              col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+              eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+                                                  1, row_c, col_c, tx_log2_c);
+              eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+                                                  2, row_c, col_c, tx_log2_c);
+              tu_idx_c++;
+            }
+          }
+        }
+
+        // In 422 case, it's possible that Chroma has more TUs than Luma
+        while (tu_idx_c < tu_num_c) {
+          row_c = (tu_idx_c / tu_num_w_c) * tx_sz_c;
+          col_c = (tu_idx_c % tu_num_w_c) * tx_sz_c;
+          eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 1,
+                                              row_c, col_c, tx_log2_c);
+          eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id, 2,
+                                              row_c, col_c, tx_log2_c);
+          tu_idx_c++;
+        }
+
+        if (bsize >= BLOCK_8X8 && eobtotal == 0)
+#if CONFIG_MISC_FIXES
+          mbmi->has_no_coeffs = 1;
+#else
+          mbmi->skip = 1;
+#endif
+      }
+    }
+  }
+#else
   if (!is_inter_block(mbmi)) {
     int plane;
 #if CONFIG_PALETTE
@@ -1545,6 +1666,7 @@
       }
     }
   }
+#endif
 
   xd->corrupted |= aom_reader_has_error(r);
 }
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 2c64880..67322e7 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1883,6 +1883,82 @@
   if (supertx_enabled) return;
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_COEF_INTERLEAVE
+  if (!m->mbmi.skip) {
+    const struct macroblockd_plane *const pd_y = &xd->plane[0];
+    const struct macroblockd_plane *const pd_c = &xd->plane[1];
+    const TX_SIZE tx_log2_y = m->mbmi.tx_size;
+    const TX_SIZE tx_log2_c = get_uv_tx_size(&m->mbmi, pd_c);
+    const int tx_sz_y = (1 << tx_log2_y);
+    const int tx_sz_c = (1 << tx_log2_c);
+
+    const BLOCK_SIZE plane_bsize_y =
+        get_plane_block_size(AOMMAX(m->mbmi.sb_type, 3), pd_y);
+    const BLOCK_SIZE plane_bsize_c =
+        get_plane_block_size(AOMMAX(m->mbmi.sb_type, 3), pd_c);
+
+    const int num_4x4_w_y = num_4x4_blocks_wide_lookup[plane_bsize_y];
+    const int num_4x4_w_c = num_4x4_blocks_wide_lookup[plane_bsize_c];
+    const int num_4x4_h_y = num_4x4_blocks_high_lookup[plane_bsize_y];
+    const int num_4x4_h_c = num_4x4_blocks_high_lookup[plane_bsize_c];
+
+    const int max_4x4_w_y = get_max_4x4_size(num_4x4_w_y, xd->mb_to_right_edge,
+                                             pd_y->subsampling_x);
+    const int max_4x4_h_y = get_max_4x4_size(num_4x4_h_y, xd->mb_to_bottom_edge,
+                                             pd_y->subsampling_y);
+    const int max_4x4_w_c = get_max_4x4_size(num_4x4_w_c, xd->mb_to_right_edge,
+                                             pd_c->subsampling_x);
+    const int max_4x4_h_c = get_max_4x4_size(num_4x4_h_c, xd->mb_to_bottom_edge,
+                                             pd_c->subsampling_y);
+
+    // The max_4x4_w/h may be smaller than tx_sz under some corner cases,
+    // e.g. when the SB is split by tile boundaries.
+    const int tu_num_w_y = (max_4x4_w_y + tx_sz_y - 1) / tx_sz_y;
+    const int tu_num_h_y = (max_4x4_h_y + tx_sz_y - 1) / tx_sz_y;
+    const int tu_num_w_c = (max_4x4_w_c + tx_sz_c - 1) / tx_sz_c;
+    const int tu_num_h_c = (max_4x4_h_c + tx_sz_c - 1) / tx_sz_c;
+    const int tu_num_y = tu_num_w_y * tu_num_h_y;
+    const int tu_num_c = tu_num_w_c * tu_num_h_c;
+
+    int tu_idx_y = 0, tu_idx_c = 0;
+    TOKEN_STATS token_stats;
+    init_token_stats(&token_stats);
+
+    assert(*tok < tok_end);
+
+    while (tu_idx_y < tu_num_y) {
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_y, &token_stats);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+      tu_idx_y++;
+
+      if (tu_idx_c < tu_num_c) {
+        pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+        (*tok)++;
+
+        pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+        (*tok)++;
+
+        tu_idx_c++;
+      }
+    }
+
+    // In 422 case, it's possible that Chroma has more TUs than Luma
+    while (tu_idx_c < tu_num_c) {
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx_log2_c, &token_stats);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+
+      tu_idx_c++;
+    }
+  }
+#else  // CONFIG_COEF_INTERLEAVE
   if (!m->mbmi.skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
@@ -1965,6 +2041,7 @@
       (*tok)++;
     }
   }
+#endif  // CONFIG_COEF_INTERLEAVE
 #else
   // PVQ writes its tokens (i.e. symbols) here.
   if (!m->mbmi.skip) {
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 0012559..0e6b815 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -503,6 +503,11 @@
     ++eob_branch[band[c]][pt];
   }
 
+#if CONFIG_COEF_INTERLEAVE
+  t->token = EOSB_TOKEN;
+  t++;
+#endif
+
   *tp = t;
 
 #if CONFIG_ADAPT_SCAN
@@ -725,6 +730,10 @@
 
 #if !CONFIG_PVQ
   if (!dry_run) {
+#if CONFIG_COEF_INTERLEAVE
+    td->counts->skip[ctx][0] += skip_inc;
+    av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg);
+#else
     int plane;
 
     td->counts->skip[ctx][0] += skip_inc;
@@ -734,6 +743,7 @@
       (*t)->token = EOSB_TOKEN;
       (*t)++;
     }
+#endif
   } else if (dry_run == DRY_RUN_NORMAL) {
     av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
   } else if (dry_run == DRY_RUN_COSTCOEFFS) {
diff --git a/configure b/configure
index d37d735..221e811 100755
--- a/configure
+++ b/configure
@@ -296,6 +296,7 @@
     simp_mv_pred
     rd_debug
     reference_buffer
+    coef_interleave
 "
 CONFIG_LIST="
     dependency_tracking