Remove palette interleave

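* Run 45-degree wavefront coding of the palette color index map over the
  whole block when the palette_throughput experiment is enabled.
* Remove the per-transform-block interleaving of palette indices with
  coefficient tokens.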

Change-Id: Ibb57004401f817dec8b00bc2a941d70a26783ff9
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index b7cd4f4..7ecf9c6 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -799,7 +799,7 @@
                            aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
                            TOKEN_STATS *token_stats) {
   const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
+#if CONFIG_VAR_TX
   int count = 0;
   const int seg_eob = tx_size_2d[tx_size];
 #endif
@@ -862,7 +862,7 @@
     }
     ++p;
 
-#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
+#if CONFIG_VAR_TX
     ++count;
     if (token == EOB_TOKEN || count == seg_eob) break;
 #endif
@@ -876,7 +876,7 @@
                            aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
                            TOKEN_STATS *token_stats) {
   const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
+#if CONFIG_VAR_TX
   int count = 0;
   const int seg_eob = tx_size_2d[tx_size];
 #endif
@@ -967,7 +967,7 @@
     }
     ++p;
 
-#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
+#if CONFIG_VAR_TX
     ++count;
     if (token == EOB_TOKEN || count == seg_eob) break;
 #endif
@@ -2182,23 +2182,18 @@
 #endif
 
 #if CONFIG_PALETTE
-#if CONFIG_PALETTE_THROUGHPUT
-  // when block is skipped, palette index is coded here
-  // since there is no coeff to be interleaved.
-  if (m->mbmi.skip)
-#endif  // CONFIG_PALETTE_THROUGHPUT
-    for (plane = 0; plane <= 1; ++plane) {
-      const uint8_t palette_size_plane =
-          m->mbmi.palette_mode_info.palette_size[plane];
-      if (palette_size_plane > 0) {
-        int rows, cols;
-        av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
-                                 &cols);
-        assert(*tok < tok_end);
-        pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1);
-        assert(*tok < tok_end + m->mbmi.skip);
-      }
+  for (plane = 0; plane <= 1; ++plane) {
+    const uint8_t palette_size_plane =
+        m->mbmi.palette_mode_info.palette_size[plane];
+    if (palette_size_plane > 0) {
+      int rows, cols;
+      av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
+                               &cols);
+      assert(*tok < tok_end);
+      pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1);
+      assert(*tok < tok_end + m->mbmi.skip);
     }
+  }
 #endif  // CONFIG_PALETTE
 
 #if CONFIG_COEF_INTERLEAVE
@@ -2339,30 +2334,8 @@
         TX_SIZE tx = get_tx_size(plane, xd);
         const int bkw = tx_size_wide_unit[tx];
         const int bkh = tx_size_high_unit[tx];
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-        const uint8_t palette_size_plane =
-            m->mbmi.palette_mode_info.palette_size[plane > 0];
-        const int bkw_in_pixel = bkw << tx_size_wide_log2[0];
-        const int bkh_in_pixel = bkh << tx_size_wide_log2[0];
-        int rows, cols;
-        av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
-                                 &cols);
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
         for (row = 0; row < num_4x4_h; row += bkh) {
           for (col = 0; col < num_4x4_w; col += bkw) {
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-            if (palette_size_plane > 0 && plane <= 1) {
-              const int col_in_pixel = col << tx_size_wide_log2[0];
-              const int row_in_pixel = row << tx_size_high_log2[0];
-              const int txbkw = AOMMIN(cols - col_in_pixel, bkw_in_pixel);
-              const int txbkh = AOMMIN(rows - row_in_pixel, bkh_in_pixel);
-              // first palette index is not coded here but in header instead
-              const int num_palette_indexes =
-                  txbkw * txbkh - ((row == 0 && col == 0) ? 1 : 0);
-              pack_palette_tokens(w, tok, palette_size_plane,
-                                  num_palette_indexes);
-            }
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
 #if !CONFIG_PVQ
             pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
 #else
@@ -2374,46 +2347,6 @@
 #else
       TX_SIZE tx = get_tx_size(plane, xd);
       TOKEN_STATS token_stats;
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      BLOCK_SIZE bsize = mbmi->sb_type;
-#if CONFIG_CB4X4
-      const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-#else
-      const BLOCK_SIZE plane_bsize =
-          get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
-#endif
-      const int num_4x4_w =
-          block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
-      const int num_4x4_h =
-          block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-      int row, col;
-      const int bkw = tx_size_wide_unit[tx];
-      const int bkh = tx_size_high_unit[tx];
-      const uint8_t palette_size_plane =
-          m->mbmi.palette_mode_info.palette_size[plane > 0];
-      const int bkw_in_pixel = bkw << tx_size_wide_log2[0];
-      const int bkh_in_pixel = bkh << tx_size_wide_log2[0];
-      int rows, cols;
-      av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
-                               &cols);
-      for (row = 0; row < num_4x4_h; row += bkh) {
-        for (col = 0; col < num_4x4_w; col += bkw) {
-          if (!is_inter_block(mbmi) && palette_size_plane > 0 && plane <= 1) {
-            const int col_in_pixel = col << tx_size_wide_log2[0];
-            const int row_in_pixel = row << tx_size_high_log2[0];
-            const int txbkw = AOMMIN(cols - col_in_pixel, bkw_in_pixel);
-            const int txbkh = AOMMIN(rows - row_in_pixel, bkh_in_pixel);
-            // first palette index is not coded here but in header instead
-            const int num_palette_indexes =
-                txbkw * txbkh - ((row == 0 && col == 0) ? 1 : 0);
-            pack_palette_tokens(w, tok, palette_size_plane,
-                                num_palette_indexes);
-          }
-          pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
-        }
-      }
-#else
 #if !CONFIG_PVQ
       init_token_stats(&token_stats);
       pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
@@ -2421,7 +2354,6 @@
       (void)token_stats;
       pack_pvq_tokens(w, x, xd, plane, mbmi->sb_type, tx);
 #endif
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
 #if CONFIG_RD_DEBUG
       if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 &&
           rd_token_stats_mismatch(&m->mbmi.rd_stats, &token_stats, plane)) {
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 174e796..065ea0e 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -5680,11 +5680,9 @@
         if (mbmi->palette_mode_info.palette_size[plane] > 0) {
           mbmi->palette_mode_info.palette_first_color_idx[plane] =
               xd->plane[plane].color_index_map[0];
-// TODO(huisu): this increases the use of token buffer. Needs stretch
-// test to verify.
-#if !CONFIG_PALETTE_THROUGHPUT
+          // TODO(huisu): this increases the use of token buffer. Needs stretch
+          // test to verify.
           av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
-#endif
         }
       }
     }
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 9df6f93..e22cf3a 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -399,7 +399,7 @@
 #endif
 #endif  // !CONFIG_PVQ
 
-#if CONFIG_PALETTE && !CONFIG_PALETTE_THROUGHPUT
+#if CONFIG_PALETTE
 void av1_tokenize_palette_sb(const AV1_COMP *cpi,
                              const struct ThreadData *const td, int plane,
                              TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
@@ -422,8 +422,15 @@
                            &cols);
   assert(plane == 0 || plane == 1);
 
+#if CONFIG_PALETTE_THROUGHPUT
+  int k;
+  for (k = 1; k < rows + cols - 1; ++k) {
+    for (j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+      i = k - j;
+#else
   for (i = 0; i < rows; ++i) {
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+#endif  // CONFIG_PALETTE_THROUGHPUT
       int color_new_idx;
       const int color_ctx = av1_get_palette_color_index_context(
           color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
@@ -441,71 +448,6 @@
 }
 #endif  // CONFIG_PALETTE
 
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-void tokenize_palette_b(int plane, int block, int blk_row, int blk_col,
-                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
-  struct tokenize_b_args *const args = arg;
-  ThreadData *const td = args->td;
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  TOKENEXTRA **t = args->tp;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const uint8_t *const color_map = xd->plane[plane].color_index_map;
-  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const int n = pmi->palette_size[plane];
-  int i, j;
-  uint8_t color_order[PALETTE_MAX_SIZE];
-  const aom_prob(
-      *const probs)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] =
-      plane == 0 ? av1_default_palette_y_color_index_prob
-                 : av1_default_palette_uv_color_index_prob;
-  int bsize = txsize_to_bsize[tx_size];
-  int plane_block_width, plane_block_height, rows, cols;
-  int block_width, block_height, tx_block_width, tx_block_height;
-  (void)block;
-
-  if (n == 0) return;
-
-  plane_block_width = block_size_wide[plane_bsize];
-  plane_block_height = block_size_high[plane_bsize];
-  rows = (xd->mb_to_bottom_edge >= 0)
-             ? plane_block_height
-             : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
-                   plane_block_height;
-  cols = (xd->mb_to_right_edge >= 0)
-             ? plane_block_width
-             : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
-                   plane_block_width;
-  assert(plane_block_width >= cols);
-  assert(plane_block_height >= rows);
-  tx_block_width = 1 << tx_size_wide_log2[0];
-  tx_block_height = 1 << tx_size_high_log2[0];
-  block_width = AOMMIN(cols - blk_col * tx_block_width, block_size_wide[bsize]);
-  block_height =
-      AOMMIN(rows - blk_row * tx_block_height, block_size_high[bsize]);
-
-  assert(plane == 0 || plane == 1);
-
-  // run wavefront on the palette map index encoding per transform block
-  for (i = ((blk_row == 0 && blk_col == 0) ? 1 : 0);
-       i < block_width + block_height - 1; ++i) {
-    for (j = AOMMIN(i, block_width - 1); j >= AOMMAX(0, i - block_height + 1);
-         --j) {
-      int color_new_idx;
-      const int color_ctx = av1_get_palette_color_index_context(
-          color_map, plane_block_width, blk_row * tx_block_width + (i - j),
-          blk_col * tx_block_height + j, n, color_order, &color_new_idx);
-      assert(color_new_idx >= 0 && color_new_idx < n);
-      (*t)->token = color_new_idx;
-      (*t)->context_tree = probs[n - 2][color_ctx];
-      (*t)->skip_eob_node = 0;
-      ++(*t);
-    }
-  }
-}
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-
 #if CONFIG_PVQ
 static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x,
                           PVQ_INFO *pvq) {
@@ -709,16 +651,6 @@
 #endif  // !CONFIG_PVQ
 }
 
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-void tokenize_joint_b(int plane, int block, int blk_row, int blk_col,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
-  if (plane < 2)
-    tokenize_palette_b(plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                       arg);
-  tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
-}
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-
 struct is_skippable_args {
   uint16_t *eobs;
   int *skippable;
@@ -865,15 +797,6 @@
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
   struct tokenize_b_args arg = { cpi, td, t, 0 };
   if (mbmi->skip) {
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-    if (!dry_run) {
-      int plane;
-      for (plane = 0; plane < 2; ++plane) {
-        av1_foreach_transformed_block_in_plane(xd, bsize, plane,
-                                               tokenize_palette_b, &arg);
-      }
-    }
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
     if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
     return;
@@ -882,12 +805,7 @@
   if (!dry_run) {
 #if CONFIG_COEF_INTERLEAVE
     td->counts->skip[ctx][0] += skip_inc;
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-    av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_joint_b,
-                                           &arg);
-#else
     av1_foreach_transformed_block_interleave(xd, bsize, tokenize_b, &arg);
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
 #else
     int plane;
 
@@ -905,13 +823,8 @@
       (void)mi_row;
       (void)mi_col;
 #endif
-#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
-      av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_joint_b,
-                                             &arg);
-#else
       av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
                                              &arg);
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
 #if !CONFIG_PVQ
       (*t)->token = EOSB_TOKEN;
       (*t)++;
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index dbfb6a8..46d3eff 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -74,15 +74,10 @@
                            int mi_col, BLOCK_SIZE bsize, int *rate);
 #endif
 #if CONFIG_PALETTE
-#if CONFIG_PALETTE_THROUGHPUT
-void tokenize_palette_b(int plane, int block, int blk_row, int blk_col,
-                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
-#else
 void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
                              const struct ThreadData *const td, int plane,
                              TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
                              int *rate);
-#endif  // CONFIG_PALETTE_THROUGHPUT
 #endif  // CONFIG_PALETTE
 void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,