Introduce rate propagation factor

Support a rate cost propagation pipeline that estimates the rate cost
associated with quantization noise. The new TPL propagation kernel
improves compression efficiency. With speed 1, VBR mode, and 150
frames, the coding gains are:

        avg PSNR   ovr PSNR  SSIM
lowres  -0.02%      0.04%    -0.60%
midres  -0.49%     -0.39%    -1.60%
ugc360  -0.23%     -0.08%    -1.07%
hdres   -0.31%     -0.27%    -1.36%

STATS_CHANGED

Change-Id: I4c08474adfe98f2bf6dc3372ff17089a0d74c206
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 60f3e23..c2220c4 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3595,8 +3595,11 @@
       if (row >= cm->mi_rows || col >= mi_cols_sr) continue;
       TplDepStats *this_stats =
           &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
-      intra_cost += this_stats->recrf_dist;
-      mc_dep_cost += this_stats->recrf_dist + this_stats->mc_dep_delta;
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
 #if !USE_TPL_CLASSIC_MODEL
       mc_count += this_stats->mc_count;
       mc_saved += this_stats->mc_saved;
@@ -3740,8 +3743,11 @@
       if (row >= cm->mi_rows || col >= mi_cols_sr) continue;
       TplDepStats *this_stats =
           &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
-      intra_cost += this_stats->recrf_dist;
-      mc_dep_cost += this_stats->recrf_dist + this_stats->mc_dep_delta;
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
 #if !USE_TPL_CLASSIC_MODEL
       mc_count += this_stats->mc_count;
       mc_saved += this_stats->mc_saved;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index d5c6858..91640dc 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3944,8 +3944,12 @@
       for (int col = 0; col < mi_cols_sr; col += step) {
         TplDepStats *this_stats =
             &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
-        intra_cost_base += this_stats->recrf_dist;
-        mc_dep_cost_base += this_stats->recrf_dist + this_stats->mc_dep_delta;
+        int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
+        mc_dep_cost_base +=
+            (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
 #if !USE_TPL_CLASSIC_MODEL
         mc_count_base += this_stats->mc_count;
         mc_saved_base += this_stats->mc_saved;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index c6e2274..c7b70ef 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -168,7 +168,11 @@
   int64_t mc_dep_cost;
   int64_t srcrf_dist;
   int64_t recrf_dist;
+  int64_t srcrf_rate;
+  int64_t recrf_rate;
   int64_t mc_dep_delta;
+  int64_t mc_dep_rate;
+  int64_t mc_dep_dist;
   int64_t src_rdcost;
   int64_t rec_rdcost;
   int_mv mv;
@@ -193,6 +197,7 @@
   int mi_rows;
   int mi_cols;
   unsigned int frame_display_index;
+  int base_rdmult;
 } TplDepFrame;
 
 typedef enum {
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index e7d2269..cfa64b9 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -287,6 +287,7 @@
                          recon_error, sse);
       int rate_cost = rate_estimator(qcoeff, eob, tx_size);
       best_rdcost = RDCOST(base_rdmult, rate_cost, *recon_error);
+      tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
     }
   }
   best_intra_cost = AOMMAX(best_intra_cost, 1);
@@ -298,8 +299,7 @@
   tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
 
-  tpl_stats->srcrf_dist = *recon_error
-                          << (TPL_DEP_COST_SCALE_LOG2 + RDDIV_BITS);
+  tpl_stats->srcrf_dist = *recon_error << (TPL_DEP_COST_SCALE_LOG2);
   tpl_stats->src_rdcost = best_rdcost << TPL_DEP_COST_SCALE_LOG2;
 
   // Final encode
@@ -350,17 +350,18 @@
   av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
                               dst_buffer_stride, eob, 0);
 
-  tpl_stats->recrf_dist = *recon_error
-                          << (TPL_DEP_COST_SCALE_LOG2 + RDDIV_BITS);
+  tpl_stats->recrf_dist = *recon_error << (TPL_DEP_COST_SCALE_LOG2);
+  tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->rec_rdcost = RDCOST(base_rdmult, rate_cost, *recon_error)
                           << TPL_DEP_COST_SCALE_LOG2;
   if (!is_inter_mode(best_mode)) {
-    tpl_stats->srcrf_dist = *recon_error
-                            << (TPL_DEP_COST_SCALE_LOG2 + RDDIV_BITS);
+    tpl_stats->srcrf_dist = *recon_error << (TPL_DEP_COST_SCALE_LOG2);
+    tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->src_rdcost = RDCOST(base_rdmult, rate_cost, *recon_error)
                             << TPL_DEP_COST_SCALE_LOG2;
   }
   tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist);
+  tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate);
   tpl_stats->rec_rdcost = AOMMAX(tpl_stats->rec_rdcost, tpl_stats->src_rdcost);
 
   if (frame_idx && best_rf_idx != -1) {
@@ -422,6 +423,35 @@
   return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
 }
 
+// Converts delta_rate into a rate cost from beta = srcrf_dist / recrf_dist
+// and dr = unscaled delta_rate / pix_num. delta_rate and the result are
+// scaled by 2^(TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT). Returns
+// int64_t so the 64-bit result is not narrowed to int on return.
+static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                               int64_t srcrf_dist, int pix_num) {
+  const int rate_shift = TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT;
+  int64_t rate_cost = delta_rate;
+
+  // srcrf_dist at or below 128: return the input rate unchanged.
+  if (srcrf_dist <= 128) return rate_cost;
+
+  const double beta = (double)srcrf_dist / recrf_dist;
+  const double dr = (double)(delta_rate >> rate_shift) / pix_num;
+  const double log_den = log(beta) / log(2.0) + 2.0 * dr;
+
+  if (log_den > log(10.0) / log(2.0)) {
+    // num would exceed 10: use log(1 / beta), the large-num limit of
+    // log(num / den) below.
+    rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0);
+    return rate_cost << rate_shift;
+  }
+
+  const double num = pow(2.0, log_den);
+  const double den = num * beta + (1 - beta) * beta;
+  rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0);
+  return rate_cost << rate_shift;
+}
+
 static AOM_INLINE void tpl_model_update_b(AV1_COMP *cpi, TplDepFrame *tpl_frame,
                                           TplDepStats *tpl_stats_ptr,
                                           int mi_row, int mi_col,
@@ -462,13 +492,19 @@
           (int64_t)(tpl_stats_ptr->quant_ratio * tpl_stats_ptr->mc_dep_cost *
                     (1.0 - iiratio_nl));
 
-      int64_t cur_dep_cost =
-          tpl_stats_ptr->rec_rdcost - tpl_stats_ptr->src_rdcost;
-      int64_t mc_dep_delta = (int64_t)(
-          tpl_stats_ptr->mc_dep_delta *
+      int64_t cur_dep_dist =
+          tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist;
+      int64_t mc_dep_dist = (int64_t)(
+          tpl_stats_ptr->mc_dep_dist *
           ((double)(tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist) /
            tpl_stats_ptr->recrf_dist));
 
+      int64_t delta_rate =
+          tpl_stats_ptr->recrf_rate - tpl_stats_ptr->srcrf_rate;
+      int64_t mc_dep_rate =
+          delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+                          tpl_stats_ptr->srcrf_dist, pix_num);
+
 #if !USE_TPL_CLASSIC_MODEL
       int64_t mc_saved = tpl_stats_ptr->intra_cost - tpl_stats_ptr->inter_cost;
 #endif  // #if !USE_TPL_CLASSIC_MODEL
@@ -483,8 +519,11 @@
           des_stats->mc_saved += (mc_saved * overlap_area) / pix_num;
 #endif  // !USE_TPL_CLASSIC_MODEL
 
-          des_stats->mc_dep_delta +=
-              ((cur_dep_cost + mc_dep_delta) * overlap_area) / pix_num;
+          des_stats->mc_dep_dist +=
+              ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
+          des_stats->mc_dep_rate +=
+              ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
+
           assert(overlap_area >= 0);
         }
       }
@@ -523,6 +562,8 @@
   int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
   int64_t srcrf_dist = src_stats->srcrf_dist / (mi_height * mi_width);
   int64_t recrf_dist = src_stats->recrf_dist / (mi_height * mi_width);
+  int64_t srcrf_rate = src_stats->srcrf_rate / (mi_height * mi_width);
+  int64_t recrf_rate = src_stats->recrf_rate / (mi_height * mi_width);
   int64_t src_rdcost = src_stats->src_rdcost / (mi_height * mi_width);
   int64_t rec_rdcost = src_stats->rec_rdcost / (mi_height * mi_width);
 
@@ -530,6 +571,8 @@
   inter_cost = AOMMAX(1, inter_cost);
   srcrf_dist = AOMMAX(1, srcrf_dist);
   recrf_dist = AOMMAX(1, recrf_dist);
+  srcrf_rate = AOMMAX(1, srcrf_rate);
+  recrf_rate = AOMMAX(1, recrf_rate);
   src_rdcost = AOMMAX(1, src_rdcost);
   rec_rdcost = AOMMAX(1, rec_rdcost);
 
@@ -541,6 +584,8 @@
       tpl_ptr->inter_cost = inter_cost;
       tpl_ptr->srcrf_dist = srcrf_dist;
       tpl_ptr->recrf_dist = recrf_dist;
+      tpl_ptr->srcrf_rate = srcrf_rate;
+      tpl_ptr->recrf_rate = recrf_rate;
       tpl_ptr->src_rdcost = src_rdcost;
       tpl_ptr->rec_rdcost = rec_rdcost;
       tpl_ptr->quant_ratio = src_stats->quant_ratio;
@@ -669,6 +714,8 @@
 
   int base_rdmult = av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6;
 
+  tpl_frame->base_rdmult = base_rdmult;
+
   for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
     x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
@@ -1244,9 +1291,12 @@
           if (mi_row >= cm->mi_rows || mi_col >= mi_cols_sr) continue;
           const TplDepStats *this_stats =
               &tpl_stats[av1_tpl_ptr_pos(cpi, mi_row, mi_col, tpl_stride)];
-          intra_cost += (double)this_stats->recrf_dist;
+          int64_t mc_dep_delta =
+              RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                     this_stats->mc_dep_dist);
+          intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS);
           mc_dep_cost +=
-              (double)this_stats->recrf_dist + this_stats->mc_dep_delta;
+              (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
         }
       }
       const double rk = intra_cost / mc_dep_cost;