Automatically turn on/off screen content tools

Turn "allow_screen_content_tools" on when the source video has many blocks
with only a few different colors. The automatic detection is enabled by
default (or with command line flag "--tune-content=default"). With
"--tune-content=screen", the screen content tools are always turned on.

On the screen_content test set, the "default" setting is less than 0.3%
worse than the "screen" setting on keyframe encoding.

Change-Id: Iac7ab8952c96531d1fae84da1823291f5987519c
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 5c521d4..b1c01b2 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -65,12 +65,10 @@
   }
 
 #if CONFIG_PALETTE
-  if (cm->allow_screen_content_tools) {
-    for (i = 0; i < 2; ++i) {
-      CHECK_MEM_ERROR(
-          cm, ctx->color_index_map[i],
-          aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
-    }
+  for (i = 0; i < 2; ++i) {
+    CHECK_MEM_ERROR(
+        cm, ctx->color_index_map[i],
+        aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
   }
 #endif  // CONFIG_PALETTE
 }
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index ec9e410..00cb1a2 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -5011,6 +5011,28 @@
 }
 #endif  // CONFIG_GLOBAL_MOTION
 
+#if CONFIG_PALETTE
+// Estimate if the source frame is screen content, based on the portion of
+// 16x16 blocks that have 2 to 4 (experimentally selected) luma colors.
+static int is_screen_content(const uint8_t *src, int stride, int width,
+                             int height) {
+  assert(src != NULL);
+  int counts = 0;  // Number of complete blocks with 2..limit colors.
+  const int blk_w = 16;
+  const int blk_h = 16;
+  const int limit = 4;  // Max distinct colors for a block to be counted.
+  for (int r = 0; r + blk_h <= height; r += blk_h) {  // Full blocks only.
+    for (int c = 0; c + blk_w <= width; c += blk_w) {
+      const int n_colors =
+          av1_count_colors(src + r * stride + c, stride, blk_w, blk_h);
+      if (n_colors > 1 && n_colors <= limit) counts++;  // Skip flat blocks.
+    }
+  }
+  // Screen content if the counted blocks cover more than 10% of the frame.
+  return counts * blk_h * blk_w * 10 > width * height;
+}
+#endif  // CONFIG_PALETTE
+
 static void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
   MACROBLOCK *const x = &td->mb;
@@ -5037,6 +5059,14 @@
   av1_zero(rdc->coef_counts);
   av1_zero(rdc->comp_pred_diff);
 
+#if CONFIG_PALETTE
+  if (cpi->auto_tune_content && frame_is_intra_only(cm)) {
+    cm->allow_screen_content_tools =
+        is_screen_content(cpi->source->y_buffer, cpi->source->y_stride,
+                          cpi->source->y_width, cpi->source->y_height);
+  }
+#endif  // CONFIG_PALETTE
+
 #if CONFIG_GLOBAL_MOTION
   av1_zero(rdc->global_motion_used);
   av1_zero(cpi->gmparams_cost);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 1af8911..88d5df2 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -500,8 +500,7 @@
   av1_free_pc_tree(&cpi->td);
 
 #if CONFIG_PALETTE
-  if (cpi->common.allow_screen_content_tools)
-    aom_free(cpi->td.mb.palette_buffer);
+  aom_free(cpi->td.mb.palette_buffer);
 #endif  // CONFIG_PALETTE
 
 #if CONFIG_ANS
@@ -2142,6 +2141,7 @@
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  MACROBLOCK *const x = &cpi->td.mb;
 
   if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
@@ -2158,9 +2158,9 @@
     assert(cm->bit_depth > AOM_BITS_8);
 
   cpi->oxcf = *oxcf;
-  cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+  x->e_mbd.bd = (int)cm->bit_depth;
 #if CONFIG_GLOBAL_MOTION
-  cpi->td.mb.e_mbd.global_motion = cm->global_motion;
+  x->e_mbd.global_motion = cm->global_motion;
 #endif  // CONFIG_GLOBAL_MOTION
 
   if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
@@ -2181,20 +2181,17 @@
           : REFRESH_FRAME_CONTEXT_BACKWARD;
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
 
-#if CONFIG_PALETTE
-  cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN);
-  if (cm->allow_screen_content_tools) {
-    MACROBLOCK *x = &cpi->td.mb;
-    if (x->palette_buffer == 0) {
-      CHECK_MEM_ERROR(cm, x->palette_buffer,
-                      aom_memalign(16, sizeof(*x->palette_buffer)));
-    }
-    // Reallocate the pc_tree, as it's contents depends on
-    // the state of cm->allow_screen_content_tools
-    av1_free_pc_tree(&cpi->td);
-    av1_setup_pc_tree(&cpi->common, &cpi->td);
+#if CONFIG_PALETTE || CONFIG_INTRABC
+  if (frame_is_intra_only(cm)) {
+    cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN);
+    // Automatically decide if screen content tools should be enabled.
+    cpi->auto_tune_content = (cpi->oxcf.content == AOM_CONTENT_DEFAULT);
   }
-#endif  // CONFIG_PALETTE
+  if (x->palette_buffer == 0) {
+    CHECK_MEM_ERROR(cm, x->palette_buffer,
+                    aom_memalign(16, sizeof(*x->palette_buffer)));
+  }
+#endif  // CONFIG_PALETTE || CONFIG_INTRABC
 #if CONFIG_EXT_INTER
   set_compound_tools(cm);
 #endif  // CONFIG_EXT_INTER
@@ -2811,8 +2808,7 @@
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
 #if CONFIG_PALETTE
-      if (cpi->common.allow_screen_content_tools)
-        aom_free(thread_data->td->palette_buffer);
+      aom_free(thread_data->td->palette_buffer);
 #endif  // CONFIG_PALETTE
 #if CONFIG_MOTION_VAR
       aom_free(thread_data->td->above_pred_buf);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 3145657..27931ee 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -651,6 +651,12 @@
 #if CONFIG_SPEED_REFS
   int sb_scanning_pass_idx;
 #endif  // CONFIG_SPEED_REFS
+
+#if CONFIG_PALETTE || CONFIG_INTRABC
+  // Specify if encoder does automatic classification of source video as
+  // screen content or not.
+  int auto_tune_content;
+#endif  // CONFIG_PALETTE || CONFIG_INTRABC
 } AV1_COMP;
 
 void av1_initialize_enc(void);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 072dead..1aa1d52 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -126,11 +126,9 @@
 
 #if CONFIG_PALETTE
         // Allocate buffers used by palette coding mode.
-        if (cpi->common.allow_screen_content_tools) {
-          CHECK_MEM_ERROR(
-              cm, thread_data->td->palette_buffer,
-              aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
-        }
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->palette_buffer,
+            aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
 #endif  // CONFIG_PALETTE
 
         // Create threads
@@ -172,7 +170,7 @@
     }
 
 #if CONFIG_PALETTE
-    if (cpi->common.allow_screen_content_tools && i < num_workers - 1)
+    if (i < num_workers - 1)
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
 #endif  // CONFIG_PALETTE
   }
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index 235964d..bac06cd 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -145,27 +145,6 @@
   return num_unique;
 }
 
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
-  int n = 0, r, c, i, val_count[256];
-  uint8_t val;
-  memset(val_count, 0, sizeof(val_count));
-
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      val = src[r * stride + c];
-      ++val_count[val];
-    }
-  }
-
-  for (i = 0; i < 256; ++i) {
-    if (val_count[i]) {
-      ++n;
-    }
-  }
-
-  return n;
-}
-
 #if CONFIG_PALETTE_DELTA_ENCODING
 static int delta_encode_cost(const int *colors, int num, int bit_depth,
                              int min_val) {
@@ -291,30 +270,3 @@
   return 2 * bit_depth * n * av1_cost_bit(128, 0);
 #endif  // CONFIG_PALETTE_DELTA_ENCODING
 }
-
-#if CONFIG_HIGHBITDEPTH
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth) {
-  int n = 0, r, c, i;
-  uint16_t val;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  int val_count[1 << 12];
-
-  assert(bit_depth <= 12);
-  memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      val = src[r * stride + c];
-      ++val_count[val];
-    }
-  }
-
-  for (i = 0; i < (1 << bit_depth); ++i) {
-    if (val_count[i]) {
-      ++n;
-    }
-  }
-
-  return n;
-}
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index f5a3c1b..8afe5a7 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -36,14 +36,6 @@
 // method.
 int av1_remove_duplicates(float *centroids, int num_centroids);
 
-// Returns the number of colors in 'src'.
-int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
-#if CONFIG_HIGHBITDEPTH
-// Same as av1_count_colors(), but for high-bitdepth mode.
-int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
-                            int bit_depth);
-#endif  // CONFIG_HIGHBITDEPTH
-
 #if CONFIG_PALETTE_DELTA_ENCODING
 // Given a color cache and a set of base colors, find if each cache color is
 // present in the base colors, record the binary results in "cache_color_found".
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index def2a36..fa4b970 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1591,6 +1591,43 @@
                                   visible_rows);
 }
 
+#if CONFIG_PALETTE || CONFIG_INTRABC
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
+  int val_count[256];  // Occurrences of each possible 8-bit sample value.
+  memset(val_count, 0, sizeof(val_count));
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      ++val_count[src[r * stride + c]];
+    }
+  }
+  int n = 0;  // Number of distinct values seen in the rows x cols region.
+  for (int i = 0; i < 256; ++i) {
+    if (val_count[i]) ++n;
+  }
+  return n;
+}
+
+#if CONFIG_HIGHBITDEPTH
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+                            int bit_depth) {
+  assert(bit_depth <= 12);  // val_count below has 1 << 12 entries.
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  int val_count[1 << 12];
+  memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      ++val_count[src[r * stride + c]];
+    }
+  }
+  int n = 0;  // Number of distinct sample values found.
+  for (int i = 0; i < (1 << bit_depth); ++i) {  // Only values < 2^bit_depth.
+    if (val_count[i]) ++n;
+  }
+  return n;
+}
+#endif  // CONFIG_HIGHBITDEPTH
+#endif  // CONFIG_PALETTE || CONFIG_INTRABC
+
 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                     BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
                     TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 2c09c20..e5614c7 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -57,6 +57,16 @@
   OUTPUT_HAS_DECODED_PIXELS
 } OUTPUT_STATUS;
 
+#if CONFIG_PALETTE || CONFIG_INTRABC
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
+#if CONFIG_HIGHBITDEPTH
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+                            int bit_depth);
+#endif  // CONFIG_HIGHBITDEPTH
+#endif  // CONFIG_PALETTE || CONFIG_INTRABC
+
 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                     BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
                     TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,