Reduce the size of TplDepStats structure

The structure TplDepStats holds statistics related to TPL
at the 16x16 block level. The size of a few variables in
that structure can be reduced by not applying the left shift
by TPL_DEP_COST_SCALE_LOG2 when they are stored. This CL
reduces the memory used by these variables by deferring the
left shift from the population stage to the consumption stage.

               Peak Memory Reduction(%)
Resolution    Single Thread(good Speed 6)
640x360              1.66
854x480              1.94
832x480              1.99
1280x720             1.88
1920x1080            2.23

Heap memory reduction was measured using the following command:
$ valgrind --tool=massif ./aomenc ...

Change-Id: I317bb5ca49021dee6ad238e8c6e3419201993cac
diff --git a/av1/ducky_encode.cc b/av1/ducky_encode.cc
index a4fa2f8..972cbbb 100644
--- a/av1/ducky_encode.cc
+++ b/av1/ducky_encode.cc
@@ -414,8 +414,10 @@
         block_stats.col = mi_col * MI_SIZE;
         block_stats.height = (1 << block_mis_log2) * MI_SIZE;
         block_stats.width = (1 << block_mis_log2) * MI_SIZE;
-        block_stats.inter_cost = tpl_stats_ptr->inter_cost;
-        block_stats.intra_cost = tpl_stats_ptr->intra_cost;
+        block_stats.inter_cost = tpl_stats_ptr->inter_cost
+                                 << TPL_DEP_COST_SCALE_LOG2;
+        block_stats.intra_cost = tpl_stats_ptr->intra_cost
+                                 << TPL_DEP_COST_SCALE_LOG2;
         block_stats.ref_frame_index = { -1, -1 };
 
         for (int i = 0; i < kBlockRefCount; ++i) {
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index e40698c..df248ca 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -950,8 +950,10 @@
 
       TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
           row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
-      sb_enc->tpl_inter_cost[count] = this_stats->inter_cost;
-      sb_enc->tpl_intra_cost[count] = this_stats->intra_cost;
+      sb_enc->tpl_inter_cost[count] = this_stats->inter_cost
+                                      << TPL_DEP_COST_SCALE_LOG2;
+      sb_enc->tpl_intra_cost[count] = this_stats->intra_cost
+                                      << TPL_DEP_COST_SCALE_LOG2;
       memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv));
       mi_count++;
       count++;
@@ -1020,7 +1022,7 @@
       mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
       srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS);
       srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS);
-      srcrf_rate += (double)this_stats->srcrf_rate;
+      srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
 #ifndef NDEBUG
       mi_count++;
 #endif
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index d2e351b..b8366f0 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -196,7 +196,7 @@
   }
 }
 
-static AOM_INLINE int64_t tpl_get_satd_cost(BitDepthInfo bd_info,
+static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info,
                                             int16_t *src_diff, int diff_stride,
                                             const uint8_t *src, int src_stride,
                                             const uint8_t *dst, int dst_stride,
@@ -458,8 +458,8 @@
 
   int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
 
-  int64_t best_intra_cost = INT64_MAX;
-  int64_t intra_cost;
+  int32_t best_intra_cost = INT32_MAX;
+  int32_t intra_cost;
   PREDICTION_MODE best_mode = DC_PRED;
 
   int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
@@ -606,8 +606,8 @@
 
   int best_rf_idx = -1;
   int_mv best_mv[2];
-  int64_t inter_cost;
-  int64_t best_inter_cost = INT64_MAX;
+  int32_t inter_cost;
+  int32_t best_inter_cost = INT32_MAX;
   int rf_idx;
   int_mv single_mv[INTER_REFS_PER_FRAME];
 
@@ -878,7 +878,7 @@
     xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME;
   }
 
-  if (best_inter_cost < INT64_MAX) {
+  if (best_inter_cost < INT32_MAX) {
     xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
     xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
     const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
@@ -894,13 +894,13 @@
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
                         use_y_only_rate_distortion, NULL);
-    tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->srcrf_rate = rate_cost;
   }
 
   best_intra_cost = AOMMAX(best_intra_cost, 1);
   best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
-  tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
-  tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->inter_cost = best_inter_cost;
+  tpl_stats->intra_cost = best_intra_cost;
 
   tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
@@ -924,11 +924,11 @@
                       use_y_only_rate_distortion, tpl_txfm_stats);
 
   tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
-  tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->recrf_rate = rate_cost;
 
   if (!is_inter_mode(best_mode)) {
     tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
-    tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->srcrf_rate = rate_cost;
     tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
   }
 
@@ -944,7 +944,7 @@
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
                         use_y_only_rate_distortion, NULL);
     tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
-    tpl_stats->cmp_recrf_rate[0] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->cmp_recrf_rate[0] = rate_cost;
 
     tpl_stats->cmp_recrf_dist[0] =
         AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]);
@@ -965,7 +965,7 @@
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
                         use_y_only_rate_distortion, NULL);
     tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
-    tpl_stats->cmp_recrf_rate[1] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->cmp_recrf_rate[1] = rate_cost;
 
     tpl_stats->cmp_recrf_dist[1] =
         AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]);
@@ -1099,15 +1099,18 @@
 
   int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref]
                                    : tpl_stats_ptr->srcrf_dist;
-  int64_t srcrf_rate = is_compound ? tpl_stats_ptr->cmp_recrf_rate[!ref]
-                                   : tpl_stats_ptr->srcrf_rate;
+  int64_t srcrf_rate =
+      is_compound
+          ? (tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2)
+          : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
 
   int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist;
   int64_t mc_dep_dist =
       (int64_t)(tpl_stats_ptr->mc_dep_dist *
                 ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) /
                  tpl_stats_ptr->recrf_dist));
-  int64_t delta_rate = tpl_stats_ptr->recrf_rate - srcrf_rate;
+  int64_t delta_rate =
+      (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate;
   int64_t mc_dep_rate =
       av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
                           srcrf_dist, pix_num);
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index b77a19f..ec49ea5 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -104,20 +104,20 @@
 } TplTxfmStats;
 
 typedef struct TplDepStats {
-  int64_t intra_cost;
-  int64_t inter_cost;
+  int64_t srcrf_sse;
   int64_t srcrf_dist;
   int64_t recrf_dist;
   int64_t cmp_recrf_dist[2];
-  int64_t srcrf_rate;
-  int64_t recrf_rate;
-  int64_t srcrf_sse;
-  int64_t cmp_recrf_rate[2];
   int64_t mc_dep_rate;
   int64_t mc_dep_dist;
-  int_mv mv[INTER_REFS_PER_FRAME];
-  int ref_frame_index[2];
   int64_t pred_error[INTER_REFS_PER_FRAME];
+  int32_t intra_cost;
+  int32_t inter_cost;
+  int32_t srcrf_rate;
+  int32_t recrf_rate;
+  int32_t cmp_recrf_rate[2];
+  int_mv mv[INTER_REFS_PER_FRAME];
+  int8_t ref_frame_index[2];
 } TplDepStats;
 
 typedef struct TplDepFrame {