Avoid redundant calculations in tpl

The following redundant computations in tpl module are avoided :
- Inverse transform when using source frame as reference.
- Rate and distortion calculation with source frame as
  reference when the best prediction mode is an intra mode.
- Start MV SAD computation when the number of start MVs
  used for motion search is 1.

      Instruction Count
cpu   Reduction(%)
 5       0.226
 6       0.309

Change-Id: I89b259b8af4533c42b4586e711745a9bd0371196
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 38877f7..3aeb511 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -231,7 +231,7 @@
     const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
     int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
     tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
-    int *rate_cost, int64_t *recon_error, int64_t *sse) {
+    int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const BitDepthInfo bd_info = get_bit_depth_info(xd);
   uint16_t eob;
@@ -244,8 +244,9 @@
 
   *rate_cost = rate_estimator(qcoeff, eob, tx_size);
 
-  av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst, dst_stride,
-                              eob, 0);
+  if (do_recon)
+    av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst,
+                                dst_stride, eob, 0);
 }
 
 static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
@@ -347,7 +348,7 @@
     tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
     const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3],
     const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
-    int mi_row, int mi_col, int use_y_only_rate_distortion,
+    int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon,
     TplTxfmStats *tpl_txfm_stats) {
   const SequenceHeader *seq_params = cm->seq_params;
   *rate_cost = 0;
@@ -435,7 +436,7 @@
         src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer,
         dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane],
         block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
-        &this_rate, &this_recon_error, &sse);
+        do_recon, &this_rate, &this_recon_error, &sse);
 
 #if CONFIG_BITRATE_ACCURACY
     if (plane == 0 && tpl_txfm_stats) {
@@ -591,7 +592,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion, NULL);
+                        use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
 
     tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
@@ -717,7 +718,7 @@
     }
 
     // Prune starting mvs
-    if (tpl_sf->prune_starting_mv) {
+    if (tpl_sf->prune_starting_mv && refmv_count > 1) {
       // Get each center mv's sad.
       for (idx = 0; idx < refmv_count; ++idx) {
         FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
@@ -728,9 +729,8 @@
       }
 
       // Rank center_mv using sad.
-      if (refmv_count > 1) {
-        qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
-      }
+      qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
+
       refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count);
       // Further reduce number of refmv based on sad difference.
       if (refmv_count > 1) {
@@ -918,7 +918,7 @@
     xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME;
   }
 
-  if (best_inter_cost < INT32_MAX) {
+  if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) {
     xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
     xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
     const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
@@ -933,7 +933,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion, NULL);
+                        use_y_only_rate_distortion, 0 /*do_recon*/, NULL);
     tpl_stats->srcrf_rate = rate_cost;
   }
 
@@ -961,7 +961,8 @@
   get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                       qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                       rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                      use_y_only_rate_distortion, tpl_txfm_stats);
+                      use_y_only_rate_distortion, 1 /*do_recon*/,
+                      tpl_txfm_stats);
 
   tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
@@ -983,7 +984,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion, NULL);
+                        use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
     tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->cmp_recrf_rate[0] = rate_cost;
 
@@ -1004,7 +1005,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion, NULL);
+                        use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
     tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->cmp_recrf_rate[1] = rate_cost;