[CFL] Store Reconstructed Luma for Intra In Inter

As with intra blocks in intra frames, an extra call to
txfm_rd_in_plane is added to the RDO of intra blocks in inter frames.
This extra call is performed using the best parameters found during
RDO, and the reconstructed luma pixels are stored.

Results on objective-1-fast (compared to CfL on Intra frames only)

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.2497 | -3.5526 | -3.5048 |  -0.2456 | -0.2392 | -0.2508 |    -1.4811

https://arewecompressedyet.com/?job=cfl-no-inter%402017-09-13&job=cfl-inter%402017-09-13T14%3A13%3A13.918Z

Change-Id: I70ea2c01859b6c55d7c3eb9680d492c0bfc2aad4
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 2988200..60c5118 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -276,58 +276,11 @@
   }
 }
 
-void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
-               int col, TX_SIZE tx_size, BLOCK_SIZE bsize) {
-  const int tx_width = tx_size_wide[tx_size];
-  const int tx_height = tx_size_high[tx_size];
+static INLINE void cfl_store(CFL_CTX *cfl, const uint8_t *input,
+                             int input_stride, int row, int col, int width,
+                             int height) {
   const int tx_off_log2 = tx_size_wide_log2[0];
 
-#if CONFIG_CHROMA_SUB8X8
-  if (bsize < BLOCK_8X8) {
-    // Transform cannot be smaller than
-    assert(tx_width >= 4);
-    assert(tx_height >= 4);
-
-    const int bw = block_size_wide[bsize];
-    const int bh = block_size_high[bsize];
-
-    // For chroma_sub8x8, the CfL prediction for prediction blocks smaller than
-    // 8X8 uses non chroma reference reconstructed luma pixels. To do so, we
-    // combine the 4X4 non chroma reference into the CfL pixel buffers based on
-    // their row and column index.
-
-    // The following code is adapted from the is_chroma_reference() function.
-    if ((cfl->mi_row &
-         0x01)        // Increment the row index for odd indexed 4X4 blocks
-        && (bh == 4)  // But not for 4X8 blocks
-        && cfl->subsampling_y) {  // And only when chroma is subsampled
-      assert(row == 0);
-      row++;
-    }
-
-    if ((cfl->mi_col &
-         0x01)        // Increment the col index for odd indexed 4X4 blocks
-        && (bw == 4)  // But not for 8X4 blocks
-        && cfl->subsampling_x) {  // And only when chroma is subsampled
-      assert(col == 0);
-      col++;
-    }
-#if CONFIG_DEBUG
-    for (int unit_r = 0; unit_r < tx_size_high_unit[tx_size]; unit_r++) {
-      assert(row + unit_r < CFL_SUB8X8_VAL_MI_SIZE);
-      int row_off = (row + unit_r) * CFL_SUB8X8_VAL_MI_SIZE;
-      for (int unit_c = 0; unit_c < tx_size_wide_unit[tx_size]; unit_c++) {
-        assert(col + unit_c < CFL_SUB8X8_VAL_MI_SIZE);
-        assert(cfl->sub8x8_val[row_off + col + unit_c] == 0);
-        cfl->sub8x8_val[row_off + col + unit_c] = 1;
-      }
-    }
-#endif  // CONFIG_DEBUG
-  }
-#else
-  (void)bsize;
-#endif  // CONFIG_CHROMA_SUB8X8
-
   // Invalidate current parameters
   cfl->are_parameters_computed = 0;
 
@@ -335,29 +288,104 @@
   // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the
   // frame boundary)
   if (col == 0 && row == 0) {
-    cfl->y_width = tx_width;
-    cfl->y_height = tx_height;
+    cfl->y_width = width;
+    cfl->y_height = height;
   } else {
-    cfl->y_width = OD_MAXI((col << tx_off_log2) + tx_width, cfl->y_width);
-    cfl->y_height = OD_MAXI((row << tx_off_log2) + tx_height, cfl->y_height);
+    cfl->y_width = OD_MAXI((col << tx_off_log2) + width, cfl->y_width);
+    cfl->y_height = OD_MAXI((row << tx_off_log2) + height, cfl->y_height);
   }
 
   // Check that we will remain inside the pixel buffer.
-  assert((row << tx_off_log2) + tx_height <= MAX_SB_SIZE);
-  assert((col << tx_off_log2) + tx_width <= MAX_SB_SIZE);
+  assert((row << tx_off_log2) + height <= MAX_SB_SIZE);
+  assert((col << tx_off_log2) + width <= MAX_SB_SIZE);
 
   // Store the input into the CfL pixel buffer
   uint8_t *y_pix = &cfl->y_pix[(row * MAX_SB_SIZE + col) << tx_off_log2];
 
   // TODO(ltrudeau) Speedup possible by moving the downsampling to cfl_store
-  for (int j = 0; j < tx_height; j++) {
-    for (int i = 0; i < tx_width; i++) {
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
       y_pix[i] = input[i];
     }
     y_pix += MAX_SB_SIZE;
     input += input_stride;
   }
 }
+#if CONFIG_CHROMA_SUB8X8
+// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
+// and non-chroma-referenced blocks are stored together in the CfL buffer.
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,
+                                        int *col_out) {
+  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
+  if ((cfl->mi_row & 0x01) && cfl->subsampling_y) {
+    assert(*row_out == 0);
+    (*row_out)++;
+  }
+
+  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
+  if ((cfl->mi_col & 0x01) && cfl->subsampling_x) {
+    assert(*col_out == 0);
+    (*col_out)++;
+  }
+}
+#if CONFIG_DEBUG
+static INLINE void sub8x8_set_val(CFL_CTX *cfl, int row, int col, int val_high,
+                                  int val_wide) {
+  for (int val_r = 0; val_r < val_high; val_r++) {
+    assert(row + val_r < CFL_SUB8X8_VAL_MI_SIZE);
+    int row_off = (row + val_r) * CFL_SUB8X8_VAL_MI_SIZE;
+    for (int val_c = 0; val_c < val_wide; val_c++) {
+      assert(col + val_c < CFL_SUB8X8_VAL_MI_SIZE);
+      assert(cfl->sub8x8_val[row_off + col + val_c] == 0);
+      cfl->sub8x8_val[row_off + col + val_c]++;
+    }
+  }
+}
+#endif  // CONFIG_DEBUG
+#endif  // CONFIG_CHROMA_SUB8X8
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize) {
+  CFL_CTX *const cfl = xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  uint8_t *dst =
+      &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+  (void)bsize;
+#if CONFIG_CHROMA_SUB8X8
+
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    // Only dimensions of size 4 can have an odd offset.
+    assert(!((col & 1) && tx_size_wide[tx_size] != 4));
+    assert(!((row & 1) && tx_size_high[tx_size] != 4));
+    sub8x8_adjust_offset(cfl, &row, &col);
+#if CONFIG_DEBUG
+    sub8x8_set_val(cfl, row, col, tx_size_high_unit[tx_size],
+                   tx_size_wide_unit[tx_size]);
+#endif  // CONFIG_DEBUG
+  }
+#endif  // CONFIG_CHROMA_SUB8X8
+  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size],
+            tx_size_high[tx_size]);
+}
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  CFL_CTX *const cfl = xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  int row = 0;
+  int col = 0;
+#if CONFIG_CHROMA_SUB8X8
+  bsize = AOMMAX(BLOCK_4X4, bsize);
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    sub8x8_adjust_offset(cfl, &row, &col);
+#if CONFIG_DEBUG
+    sub8x8_set_val(cfl, row, col, mi_size_high[bsize], mi_size_wide[bsize]);
+#endif  // CONFIG_DEBUG
+  }
+#endif  // CONFIG_CHROMA_SUB8X8
+  const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
+  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, width, height);
+}
 
 void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
   CFL_CTX *const cfl = xd->cfl;
@@ -393,7 +421,7 @@
     assert(cfl->y_width <= cfl->uv_width << cfl->subsampling_x);
     assert(cfl->y_height <= cfl->uv_height << cfl->subsampling_y);
   }
-#endif
+#endif  // CONFIG_DEBUG
 
   // Compute block-level DC_PRED for both chromatic planes.
   // DC_PRED replaces beta in the linear model.
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 0cefd06..7a56a49 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -22,8 +22,10 @@
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
                        int row, int col, TX_SIZE tx_size, int plane);
 
-void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
-               int col, TX_SIZE tx_size, BLOCK_SIZE bsize);
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize);
 
 void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size);
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0080c2d..949a69d 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -519,20 +519,9 @@
   }
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-#if CONFIG_CHROMA_SUB8X8
-    const BLOCK_SIZE plane_bsize =
-        AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, pd));
-#else
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
-#endif  // CONFIG_CHROMA_SUB8X8
-    uint8_t *dst =
-        &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
-    // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
-    // intra predicted.
-    cfl_store(xd->cfl, dst, pd->dst.stride, row, col, tx_size, plane_bsize);
+    cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
   }
-#endif  // CONFIG_CFL
+#endif  // CONFIG_CFL
 }
 
 #if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
@@ -1769,6 +1758,11 @@
 
   set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
+  CFL_CTX *const cfl = xd->cfl;
+  cfl->is_chroma_reference = is_chroma_reference(
+      mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
 
 #if CONFIG_DELTA_Q
   if (cm->delta_q_present_flag) {
@@ -1966,11 +1960,6 @@
         }
       }
     }
-#if CONFIG_CFL && CONFIG_CB4X4 && CONFIG_DEBUG
-    if (xd->cfl->is_chroma_reference) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
-#endif  // CONFIG_CFL && CONFIG_CB4X4 && CONFIG_DEBUG
   } else {
     int ref;
 
@@ -2103,6 +2092,18 @@
       }
     }
   }
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
+  if (mbmi->uv_mode != UV_CFL_PRED) {
+#if CONFIG_DEBUG
+    if (cfl->is_chroma_reference) {
+      cfl_clear_sub8x8_val(cfl);
+    }
+#endif  // CONFIG_DEBUG
+    if (!cfl->is_chroma_reference && is_inter_block(mbmi)) {
+      cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+    }
+  }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
 #endif  // CONFIG_COEF_INTERLEAVE
 
   int reader_corrupted_flag = aom_reader_has_error(r);
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index be62400..81e1e06 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -1214,9 +1214,7 @@
 #if CONFIG_CFL
     if (mbmi->uv_mode == UV_CFL_PRED) {
       mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
-      // TODO(ltrudeau) Remove key_frame check (used to test CfL only in Intra
-      // frame).
-      xd->cfl->store_y = cm->frame_type == KEY_FRAME;
+      xd->cfl->store_y = 1;
     } else {
       xd->cfl->store_y = 0;
     }
@@ -1228,9 +1226,7 @@
     mbmi->uv_mode = UV_DC_PRED;
 #if CONFIG_CFL
     xd->cfl->is_chroma_reference = 0;
-    // TODO(ltrudeau) Remove key_frame check (used to test CfL only in Intra
-    // frame).
-    xd->cfl->store_y = cm->frame_type == KEY_FRAME;
+    xd->cfl->store_y = 1;
 #endif
   }
 #endif
@@ -1811,15 +1807,20 @@
     if (mbmi->uv_mode == UV_CFL_PRED) {
       mbmi->cfl_alpha_idx =
           read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
-      // TODO(ltrudeau) Remove key_frame check (used to test CfL only in Intra
-      // frame).
-      xd->cfl->store_y = cm->frame_type == KEY_FRAME;
+      xd->cfl->store_y = 1;
     } else {
       xd->cfl->store_y = 0;
     }
 #endif  // CONFIG_CFL
 
 #if CONFIG_CB4X4
+  } else {
+    // Avoid decoding angle_info if there is no chroma prediction
+    mbmi->uv_mode = UV_DC_PRED;
+#if CONFIG_CFL
+    xd->cfl->is_chroma_reference = 0;
+    xd->cfl->store_y = 1;
+#endif
   }
 #endif
 
@@ -2275,6 +2276,7 @@
 
   assert(NELEMENTS(mode_2_counter) == MB_MODE_COUNT);
 
+  mbmi->uv_mode = UV_DC_PRED;
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index b4d5a7e..04d2f79 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4109,9 +4109,7 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    if (sum_rdc.rdcost >= best_rdc.rdcost) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
+    cfl_clear_sub8x8_val(xd->cfl);
 #endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_HORZ];
@@ -4272,9 +4270,7 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
-    if (sum_rdc.rdcost >= best_rdc.rdcost) {
-      cfl_clear_sub8x8_val(xd->cfl);
-    }
+    cfl_clear_sub8x8_val(xd->cfl);
 #endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -6105,9 +6101,7 @@
 
   if (!is_inter) {
 #if CONFIG_CFL
-    // TODO(ltrudeau) Remove key_frame check (used to test CfL only in Intra
-    // frame).
-    xd->cfl->store_y = cm->frame_type == KEY_FRAME;
+    xd->cfl->store_y = 1;
 #endif  // CONFIG_CFL
     int plane;
     mbmi->skip = 1;
@@ -6117,13 +6111,13 @@
     }
 #if CONFIG_CFL
     xd->cfl->store_y = 0;
-#if CONFIG_CB4X4 && CONFIG_DEBUG
+#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
     if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x,
                             xd->cfl->subsampling_y) &&
         !xd->cfl->are_parameters_computed) {
       cfl_clear_sub8x8_val(xd->cfl);
     }
-#endif  // CONFIG_CB4X4 && CONFIG_DEBUG
+#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
 #endif  // CONFIG_CFL
     if (!dry_run) {
       sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
@@ -6334,6 +6328,21 @@
     set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd);
   }
 #endif  // CONFIG_VAR_TX
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
+  CFL_CTX *const cfl = xd->cfl;
+#if CONFIG_DEBUG
+  if (is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
+                          cfl->subsampling_y) &&
+      !cfl->are_parameters_computed) {
+    cfl_clear_sub8x8_val(cfl);
+  }
+#endif  // CONFIG_DEBUG
+  if (is_inter_block(mbmi) &&
+      !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
+                           cfl->subsampling_y)) {
+    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+  }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
 }
 
 #if CONFIG_SUPERTX
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 565f420..ada6b8b 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -741,27 +741,29 @@
 
   if (p->eobs[block]) *(args->skip) = 0;
 
-  if (p->eobs[block] == 0) return;
+  if (p->eobs[block] != 0)
 #else
   (void)ctx;
   if (!x->pvq_skip[plane]) *(args->skip) = 0;
 
-  if (x->pvq_skip[plane]) return;
+  if (!x->pvq_skip[plane])
 #endif
+  {
 #if CONFIG_LGT
-  PREDICTION_MODE mode = xd->mi[0]->mbmi.mode;
+    PREDICTION_MODE mode = xd->mi[0]->mbmi.mode;
 #endif  // CONFIG_LGT
-  TX_TYPE tx_type =
-      av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size);
-  av1_inverse_transform_block(xd, dqcoeff,
+    TX_TYPE tx_type =
+        av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size);
+    av1_inverse_transform_block(xd, dqcoeff,
 #if CONFIG_LGT
-                              mode,
+                                mode,
 #endif  // CONFIG_LGT
 #if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              mrc_mask,
+                                mrc_mask,
 #endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
-                              tx_type, tx_size, dst, pd->dst.stride,
-                              p->eobs[block]);
+                                tx_type, tx_size, dst, pd->dst.stride,
+                                p->eobs[block]);
+  }
 }
 
 #if CONFIG_VAR_TX
@@ -1113,9 +1115,7 @@
 #endif
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
-    // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
-    // intra predicted.
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
 #endif  // CONFIG_CFL
 }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index c6dd076..acee6de 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2081,15 +2081,14 @@
   }
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const int dst_stride = pd->dst.stride;
-    uint8_t *dst =
-        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-    // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
-    // intra predicted.
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
+#if CONFIG_CHROMA_SUB8X8
+    assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+#else
+    assert(!is_inter_block(mbmi));
+#endif  // CONFIG_CHROMA_SUB8X8
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
-#endif
+#endif  // CONFIG_CFL
   rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
   if (args->this_rd + rd > args->best_rd) {
     args->exit_early = 1;
@@ -6027,18 +6026,11 @@
 
     mbmi->uv_mode = mode;
 #if CONFIG_CFL
-    const AV1_COMMON *const cm = &cpi->common;
     int cfl_alpha_rate = 0;
     if (mode == UV_CFL_PRED) {
       assert(!is_directional_mode);
-      // TODO(ltrudeau) Remove key_frame check (used to test CfL only in Intra
-      // frame).
-      if (cm->frame_type == KEY_FRAME) {
-        const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
-        cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size);
-      } else {
-        continue;
-      }
+      const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
+      cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size);
     }
 #endif
 #if CONFIG_EXT_INTRA
@@ -6124,9 +6116,11 @@
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  UV_PREDICTION_MODE *mode_uv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
-  init_sbuv_mode(&x->e_mbd.mi[0]->mbmi);
+  init_sbuv_mode(mbmi);
 #if CONFIG_CB4X4
 #if !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) {
@@ -6137,15 +6131,34 @@
     *mode_uv = UV_DC_PRED;
     return;
   }
-  bsize = scale_chroma_bsize(bsize, x->e_mbd.plane[AOM_PLANE_U].subsampling_x,
-                             x->e_mbd.plane[AOM_PLANE_U].subsampling_y);
+  bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+                             xd->plane[AOM_PLANE_U].subsampling_y);
 #endif  // !CONFIG_CHROMA_2X2
+#if CONFIG_CFL
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl->store_y = !x->skip_chroma_rd;
+#endif  // CONFIG_CFL
 #else
   bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
+#if CONFIG_CFL
+  xd->cfl->store_y = 1;
+#endif  // CONFIG_CFL
 #endif  // CONFIG_CB4X4
+#if CONFIG_CFL
+  if (xd->cfl->store_y) {
+    // Perform one extra call to txfm_rd_in_plane(), with the values chosen
+    // during luma RDO, so we can store reconstructed luma values
+    RD_STATS this_rd_stats;
+    txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
+                     mbmi->sb_type, mbmi->tx_size,
+                     cpi->sf.use_fast_coef_costing);
+    xd->cfl->store_y = 0;
+  }
+#endif  // CONFIG_CFL
   rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                           bsize, max_tx_size);
-  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
+  *mode_uv = mbmi->uv_mode;
 }
 
 static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
@@ -9933,23 +9946,17 @@
 
   if (intra_yrd < best_rd) {
 #if CONFIG_CFL
-    // Perform one extra txfm_rd_in_plane() call, this time with the best value
-    // so we can store reconstructed luma values
-    RD_STATS this_rd_stats;
-
 #if CONFIG_CB4X4
-    // Don't store the luma value if no chroma is associated.
-    // Don't worry, we will store this reconstructed luma in the following
-    // encode dry-run the chroma plane will never know.
-    // TODO(ltrudeau) Delete frame type check (only used to test key-frame only
-    // CfL)
-    xd->cfl->store_y = !x->skip_chroma_rd && cm->frame_type == KEY_FRAME;
+    // Only store reconstructed luma when there's chroma RDO. When there's no
+    // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+    xd->cfl->store_y = !x->skip_chroma_rd;
 #else
-    // TODO(ltrudeau) Delete frame type check (only used to test key-frame only
-    // CfL)
-    xd->cfl->store_y = cm->frame_type == KEY_FRAME;
+    xd->cfl->store_y = 1;
 #endif  // CONFIG_CB4X4
     if (xd->cfl->store_y) {
+      // Perform one extra call to txfm_rd_in_plane(), with the values chosen
+      // during luma RDO, so we can store reconstructed luma values
+      RD_STATS this_rd_stats;
       txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
                        mbmi->sb_type, mbmi->tx_size,
                        cpi->sf.use_fast_coef_costing);