improving palette throughput

* code the palette color index using 45 wavefront
* interleave the coeff and palette color index in
  transform block level
* the above change does not change code efficiency

Details: 
The 45 wavefront scan allows to compute the ctx of
the diagonal samples' indices  at the same time. 
Interleaving palette indices and palette residual
on a transform block basis means that the entropy
 decoding and further processing of the palette 
residual is not delayed by the entropy decoding 
of all the color indices of the palette encoded 
block.
Change-Id: Ie9f576002a9a68394b99c23b01e9730df06df070
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index d788276..7aa459e 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -783,7 +783,7 @@
                            aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
                            TOKEN_STATS *token_stats) {
   const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX
+#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
   int count = 0;
   const int seg_eob = tx_size_2d[tx_size];
 #endif
@@ -846,7 +846,7 @@
     }
     ++p;
 
-#if CONFIG_VAR_TX
+#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
     ++count;
     if (token == EOB_TOKEN || count == seg_eob) break;
 #endif
@@ -860,7 +860,7 @@
                            aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
                            TOKEN_STATS *token_stats) {
   const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX
+#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
   int count = 0;
   const int seg_eob = tx_size_2d[tx_size];
 #endif
@@ -951,7 +951,7 @@
     }
     ++p;
 
-#if CONFIG_VAR_TX
+#if CONFIG_VAR_TX || CONFIG_PALETTE_THROUGHPUT
     ++count;
     if (token == EOB_TOKEN || count == seg_eob) break;
 #endif
@@ -2062,7 +2062,6 @@
 #else
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 #endif
-
 #if CONFIG_PVQ
   mbmi = &m->mbmi;
   bsize = mbmi->sb_type;
@@ -2070,18 +2069,23 @@
 #endif
 
 #if CONFIG_PALETTE
-  for (plane = 0; plane <= 1; ++plane) {
-    const uint8_t palette_size_plane =
-        m->mbmi.palette_mode_info.palette_size[plane];
-    if (palette_size_plane > 0) {
-      int rows, cols;
-      av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
-                               &cols);
-      assert(*tok < tok_end);
-      pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1);
-      assert(*tok < tok_end + m->mbmi.skip);
+#if CONFIG_PALETTE_THROUGHPUT
+  // when block is skipped, palette index is coded here
+  // since there is no coeff to be interleaved.
+  if (m->mbmi.skip)
+#endif  // CONFIG_PALETTE_THROUGHPUT
+    for (plane = 0; plane <= 1; ++plane) {
+      const uint8_t palette_size_plane =
+          m->mbmi.palette_mode_info.palette_size[plane];
+      if (palette_size_plane > 0) {
+        int rows, cols;
+        av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
+                                 &cols);
+        assert(*tok < tok_end);
+        pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1);
+        assert(*tok < tok_end + m->mbmi.skip);
+      }
     }
-  }
 #endif  // CONFIG_PALETTE
 
 #if !CONFIG_PVQ
@@ -2219,17 +2223,81 @@
                            : m->mbmi.tx_size;
         const int bkw = tx_size_wide_unit[tx];
         const int bkh = tx_size_high_unit[tx];
-
-        for (row = 0; row < num_4x4_h; row += bkh)
-          for (col = 0; col < num_4x4_w; col += bkw)
+#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
+        const uint8_t palette_size_plane =
+            m->mbmi.palette_mode_info.palette_size[plane > 0];
+        const int bkw_in_pixel = bkw << tx_size_wide_log2[0];
+        const int bkh_in_pixel = bkh << tx_size_wide_log2[0];
+        int rows, cols;
+        av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
+                                 &cols);
+#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
+        for (row = 0; row < num_4x4_h; row += bkh) {
+          for (col = 0; col < num_4x4_w; col += bkw) {
+#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
+            if (palette_size_plane > 0 && plane <= 1) {
+              const int col_in_pixel = col << tx_size_wide_log2[0];
+              const int row_in_pixel = row << tx_size_high_log2[0];
+              const int txbkw = AOMMIN(cols - col_in_pixel, bkw_in_pixel);
+              const int txbkh = AOMMIN(rows - row_in_pixel, bkh_in_pixel);
+              // first palette index is not coded here but in header instead
+              const int num_palette_indexes =
+                  txbkw * txbkh - ((row == 0 && col == 0) ? 1 : 0);
+              pack_palette_tokens(w, tok, palette_size_plane,
+                                  num_palette_indexes);
+            }
+#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
             pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+          }
+        }
       }
 #else
       TX_SIZE tx =
           plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane]) : m->mbmi.tx_size;
       TOKEN_STATS token_stats;
+#if CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_CB4X4
+      const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+#else
+      const BLOCK_SIZE plane_bsize =
+          get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+#endif
+      const int num_4x4_w =
+          block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+      const int num_4x4_h =
+          block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+      int row, col;
+      const int bkw = tx_size_wide_unit[tx];
+      const int bkh = tx_size_high_unit[tx];
+      const uint8_t palette_size_plane =
+          m->mbmi.palette_mode_info.palette_size[plane > 0];
+      const int bkw_in_pixel = bkw << tx_size_wide_log2[0];
+      const int bkh_in_pixel = bkh << tx_size_wide_log2[0];
+      int rows, cols;
+      av1_get_block_dimensions(m->mbmi.sb_type, plane, xd, NULL, NULL, &rows,
+                               &cols);
+      for (row = 0; row < num_4x4_h; row += bkh) {
+        for (col = 0; col < num_4x4_w; col += bkw) {
+          if (!is_inter_block(mbmi) && palette_size_plane > 0 && plane <= 1) {
+            const int col_in_pixel = col << tx_size_wide_log2[0];
+            const int row_in_pixel = row << tx_size_high_log2[0];
+            const int txbkw = AOMMIN(cols - col_in_pixel, bkw_in_pixel);
+            const int txbkh = AOMMIN(rows - row_in_pixel, bkh_in_pixel);
+            // first palette index is not coded here but in header instead
+            const int num_palette_indexes =
+                txbkw * txbkh - ((row == 0 && col == 0) ? 1 : 0);
+            pack_palette_tokens(w, tok, palette_size_plane,
+                                num_palette_indexes);
+          }
+          pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+        }
+      }
+#else
       init_token_stats(&token_stats);
       pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#endif  // CONFIG_PALETTE && CONFIG_PALETTE_THROUGHPUT
 #if CONFIG_RD_DEBUG
       if (is_inter_block(mbmi) && mbmi->sb_type >= BLOCK_8X8 &&
           rd_token_stats_mismatch(&m->mbmi.rd_stats, &token_stats, plane)) {