[CFL] Reorganize Reconstructed Pixel Buffering

Reworked how the storage flag is set to avoid duplication on the encoder
side. Reconstructed Luma pixels are stored in encode_superblock in the
loop that calls av1_encode_intra_block_plane and in the extra call to
txfm_rd_in_plane after the luma RDO, but prior to the chroma RDO.

This change does not alter the bitstream.

Change-Id: Ifd8441363ea0733fea3d06129a025940abb2abc9
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f730fac..4a9359d 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1380,10 +1380,6 @@
   mbmi->mi_row = mi_row;
   mbmi->mi_col = mi_col;
 #endif
-#if CONFIG_CFL
-  // Don't store luma during RDO. Only store luma when best luma is known
-  x->cfl_store_y = 0;
-#endif
 #if CONFIG_SUPERTX
   // We set tx_size here as skip blocks would otherwise not set it.
   // tx_size needs to be set at this point as supertx_enable in
@@ -4388,10 +4384,6 @@
   *rate_nocoef = best_rate_nocoef;
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_CFL
-  // Store the luma for the best mode
-  x->cfl_store_y = 1;
-#endif
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
     if (bsize == cm->sb_size) {
@@ -4405,9 +4397,6 @@
                 pc_tree, NULL);
     }
   }
-#if CONFIG_CFL
-  x->cfl_store_y = 0;
-#endif
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
   if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
@@ -5957,17 +5946,20 @@
   x->pvq_speed = 0;
   x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
 #endif
-#if CONFIG_CFL
-  x->cfl_store_y = 1;
-#endif
 
   if (!is_inter) {
+#if CONFIG_CFL
+    x->cfl_store_y = 1;
+#endif  // CONFIG_CFL
     int plane;
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
       av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1,
                                    mi_row, mi_col);
     }
+#if CONFIG_CFL
+    x->cfl_store_y = 0;
+#endif  // CONFIG_CFL
     if (!dry_run) {
       sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm), mi_row, mi_col);
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index ae55833..9498fd4 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -579,7 +579,7 @@
 #if CONFIG_CFL
   // Don't store luma on the fist pass since chroma is not computed
   x->cfl_store_y = 0;
-#endif
+#endif  // CONFIG_CFL
   av1_frame_init_quantizer(cpi);
 
 #if CONFIG_PVQ
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 180001a..695b4c4 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9843,12 +9843,12 @@
 #else
     x->cfl_store_y = 1;
 #endif  // CONFIG_CB4X4
-
-    txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
-                     mbmi->sb_type, mbmi->tx_size,
-                     cpi->sf.use_fast_coef_costing);
-
-    x->cfl_store_y = 0;
+    if (x->cfl_store_y) {
+      txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
+                       mbmi->sb_type, mbmi->tx_size,
+                       cpi->sf.use_fast_coef_costing);
+      x->cfl_store_y = 0;
+    }
 #endif  // CONFIG_CFL
     max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x]
                                      [pd[1].subsampling_y];