AV1 RT: Add color to RDCost

7% BDRate improvement in UV PSNR. On some clips the UV BDRate
improvement is 20-30%.

Change-Id: Id981340dc4e2f123545362244935c1d2b2198897
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 468de1a..febac03 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -440,6 +440,9 @@
 #endif
   int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
   uint8_t variance_low[105];
+  // Strong color activity detection. Used in REALTIME coding mode to enhance
+  // the visual quality at the boundary of moving color objects.
+  uint8_t color_sensitivity[2];
 };
 
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index fd11726..06c7b40 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4055,6 +4055,8 @@
     }
 
     x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
 
     if (!use_nonrd_mode) {
       av1_zero(x->txb_rd_record_8X8);
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 387537d..850b6c5 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -968,6 +968,56 @@
   }
 }
 
+static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               RD_STATS *this_rdc, unsigned int *var_y,
+                               unsigned int *sse_y, int start_plane,
+                               int stop_plane) {
+  // Model rate and distortion for planes start_plane..stop_plane (inclusive).
+  // Transform coeffs (and so quantizer steps) are 8x an orthogonal transform,
+  // so the quantizer is divided by 8 (>> 3) before the modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  int i;
+  uint32_t tot_var = *var_y;  // running totals; written back on exit
+  uint32_t tot_sse = *sse_y;
+
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
+
+  for (i = start_plane; i <= stop_plane; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const uint32_t dc_quant = p->dequant_QTX[0];
+    const uint32_t ac_quant = p->dequant_QTX[1];
+    const BLOCK_SIZE bs = plane_bsize;
+    unsigned int var;
+    if (!x->color_sensitivity[i - 1]) continue;  // plane not flagged; skip
+
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                             pd->dst.stride, &sse);
+    assert(sse >= var);
+    tot_var += var;
+    tot_sse += sse;
+
+    av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                 dc_quant >> 3, &rate, &dist);
+
+    this_rdc->rate += rate >> 1;  // dc_quant model: half the rate is added
+    this_rdc->dist += dist << 3;
+
+    av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
+                                 &rate, &dist);
+
+    this_rdc->rate += rate;  // ac_quant model: full rate is added
+    this_rdc->dist += dist << 4;
+  }
+
+  *var_y = tot_var;  // in/out: caller's totals plus the processed planes
+  *sse_y = tot_sse;
+}
+
 struct estimate_block_intra_args {
   AV1_COMP *cpi;
   MACROBLOCK *x;
@@ -994,20 +1044,21 @@
   RD_STATS this_rdc;
 
   (void)block;
-  (void)plane_bsize;
-  assert(plane == 0);
 
   p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
   pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
 
-  av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
 
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
     block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, &this_sse, bsize_tx,
               AOMMIN(tx_size, TX_16X16));
   } else {
-    return;
+    unsigned int var = 0;
+    unsigned int sse = 0;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
+                       plane);
   }
 
   p->src.buf = src_buf_base;
@@ -1443,6 +1494,11 @@
       }
     }
 
+    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+      this_early_term = 0;
+      this_rdc.skip = 0;
+    }
+
     if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) {
       sse_zeromv_norm =
           sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
@@ -1487,6 +1543,24 @@
       this_rdc.dist = sse_y << 4;
     }
 
+    if (!this_early_term &&
+        (x->color_sensitivity[0] || x->color_sensitivity[1])) {
+      RD_STATS rdc_uv;
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(
+          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+      if (x->color_sensitivity[0]) {
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      AOM_PLANE_U, AOM_PLANE_U);
+      }
+      if (x->color_sensitivity[1]) {
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      AOM_PLANE_V, AOM_PLANE_V);
+      }
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
+      this_rdc.rate += rdc_uv.rate;
+      this_rdc.dist += rdc_uv.dist;
+    }
+
     // TODO(kyslov) account for UV prediction cost
     this_rdc.rate += rate_mv;
     const int16_t mode_ctx =
@@ -1572,6 +1646,8 @@
                               rd_thresh_freq_fact[mode_index])) {
         continue;
       }
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(
+          bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
 
       mi->mode = this_mode;
       mi->ref_frame[0] = INTRA_FRAME;
@@ -1585,6 +1661,13 @@
       av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
                                              &args);
       // TODO(kyslov@) Need to account for skippable
+      if (x->color_sensitivity[0])
+        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
+                                               estimate_block_intra, &args);
+      if (x->color_sensitivity[1])
+        av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
+                                               estimate_block_intra, &args);
+
       int mode_cost = 0;
       if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
         mode_cost += x->angle_delta_cost[this_mode - V_PRED]
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index d17e686..6ee7c16 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -496,6 +496,28 @@
   }
 }
 
+static void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                         unsigned int y_sad, int is_key_frame) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  if (is_key_frame) return;  // flags are left unchanged on key frames
+
+  for (i = 1; i <= 2; ++i) {
+    unsigned int uv_sad = UINT_MAX;  // kept if the plane block size is invalid
+    struct macroblock_plane *p = &x->plane[i];
+    struct macroblockd_plane *pd = &xd->plane[i];
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+    if (bs != BLOCK_INVALID)
+      uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
+                                   pd->dst.stride);
+
+    x->color_sensitivity[i - 1] = uv_sad > (y_sad / 6);  // index = plane - 1
+  }
+}
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 // TODO(kyslov): lot of things. Bring back noise estimation, brush up partition
@@ -534,6 +556,9 @@
   const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
 
+  unsigned int y_sad = UINT_MAX;
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
   // Ref frame used in partitioning.
   MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
 
@@ -589,17 +614,16 @@
     if (cpi->sf.estimate_motion_for_var_based_partition) {
       if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
         const MV dummy_mv = { 0, 0 };
-        av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, mi_row,
-                                      mi_col, &dummy_mv);
+        y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size,
+                                              mi_row, mi_col, &dummy_mv);
       }
     }
+    if (y_sad == UINT_MAX) {
+      y_sad = cpi->fn_ptr[bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+          xd->plane[0].pre[0].stride);
+    }
 
-// TODO(kyslov): bring the small SAD functionality back
-#if 0
-    y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
-                                   xd->plane[0].pre[0].buf,
-                                   xd->plane[0].pre[0].stride);
-#endif
     x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
 
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
@@ -856,6 +880,7 @@
     set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
                           mi_col, mi_row);
   }
+  chroma_check(cpi, x, bsize, y_sad, is_key_frame);
 
   if (vt2) aom_free(vt2);
   if (vt) aom_free(vt);