[CFL] DC_PRED on Partition Unit Neighboring Pixels

To avoid a cascade of encodes when performing CfL RDO,
we compute DC_PRED over the whole partition unit instead
of per transform block. To do so, we change the chroma
tx_size used by CfL to match the size of the partition
unit (i.e. a CfL partition contains exactly one
transform block).

This change requires disabling CfL when a chroma
partition-unit-sized DC_PRED is unavailable (i.e. for
4:1 and 1:4 partitions, and for chroma partitions
larger than 32X32).

Results on Subset1 (compared to disabling 4:1 and 1:4 partition units):
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1243 | -1.9286 | -2.0140 |  -0.1514 | -0.1512 | -0.1947 |    -0.8066

https://two.arewecompressedyet.com/?job=master%402017-12-12T14%3A53%3A01.451Z&job=CfL-PartU%402017-12-12T15%3A39%3A36.794Z

Change-Id: I2a4adde79c10089130775b8e0df5f9c198855cad
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 75297e2..24be7d9 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1158,6 +1158,14 @@
 
 static INLINE TX_SIZE av1_get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                          const struct macroblockd_plane *pd) {
+#if CONFIG_CFL
+  if (!is_inter_block(mbmi) && mbmi->uv_mode == UV_CFL_PRED) {
+    const BLOCK_SIZE plane_bsize =
+        AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, pd));
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    return max_txsize_rect_lookup[0][plane_bsize];
+  }
+#endif
   const TX_SIZE uv_txsize =
       uv_txsize_lookup[mbmi->sb_type][mbmi->tx_size][pd->subsampling_x]
                       [pd->subsampling_y];
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 95c61d9..29a7d0d 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -186,27 +186,27 @@
 }
 
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       int row, int col, TX_SIZE tx_size, int plane) {
+                       TX_SIZE tx_size, int plane) {
   CFL_CTX *const cfl = &xd->cfl;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  assert(is_cfl_allowed(xd));
 
   if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);
 
-  const int16_t *pred_buf_q3 =
-      cfl->pred_buf_q3 + ((row * MAX_SB_SIZE + col) << tx_size_wide_log2[0]);
   const int alpha_q3 =
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
 #if CONFIG_HIGHBITDEPTH
   if (get_bitdepth_data_path_index(xd)) {
     uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
-    cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride,
+    cfl_build_prediction_hbd(cfl->pred_buf_q3, dst_16, dst_stride,
                              tx_size_wide[tx_size], tx_size_high[tx_size],
                              alpha_q3, xd->bd);
     return;
   }
 #endif  // CONFIG_HIGHBITDEPTH
-  cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size],
-                           tx_size_high[tx_size], alpha_q3);
+  cfl_build_prediction_lbd(cfl->pred_buf_q3, dst, dst_stride,
+                           tx_size_wide[tx_size], tx_size_high[tx_size],
+                           alpha_q3);
 }
 
 static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
@@ -466,7 +466,7 @@
   uint8_t *dst =
       &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
 
-  assert(is_cfl_allowed(&xd->mi[0]->mbmi));
+  assert(is_cfl_allowed(xd));
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     // Only dimensions of size 4 can have an odd offset.
     assert(!((col & 1) && tx_size_wide[tx_size] != 4));
@@ -487,7 +487,7 @@
   int col = 0;
   bsize = AOMMAX(BLOCK_4X4, bsize);
 
-  assert(is_cfl_allowed(&xd->mi[0]->mbmi));
+  assert(is_cfl_allowed(xd));
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     sub8x8_adjust_offset(cfl, &row, &col);
 #if CONFIG_DEBUG
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 84e6b32..92fbbd0 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -14,19 +14,21 @@
 
 #include "av1/common/blockd.h"
 
-static INLINE int is_cfl_allowed(const MB_MODE_INFO *mbmi) {
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  assert(bsize >= BLOCK_4X4);  // Intra luma partitions can't be < 4X4
-  assert(bsize < BLOCK_SIZES_ALL);
-  return (bsize >= BLOCK_4X4) && (bsize < BLOCK_SIZES);
+static INLINE int is_cfl_allowed(const MACROBLOCKD *const xd) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE plane_bsize = AOMMAX(
+      BLOCK_4X4, get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]));
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+  return plane_bsize <= BLOCK_32X32;
 }
+
 static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
   int scaled_luma_q6 = alpha_q3 * pred_buf_q3;
   return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
 }
 
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       int row, int col, TX_SIZE tx_size, int plane);
+                       TX_SIZE tx_size, int plane);
 
 void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
 
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 0b41123..6ce57a8 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -2779,7 +2779,17 @@
 
 #if CONFIG_CFL
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
-    cfl_predict_block(xd, dst, dst_stride, blk_row, blk_col, tx_size, plane);
+#if CONFIG_DEBUG
+    assert(blk_col == 0);
+    assert(blk_row == 0);
+    assert(is_cfl_allowed(xd));
+    const BLOCK_SIZE plane_bsize =
+        AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, pd));
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+    assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+#endif
+    cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
   }
 #endif
 }
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 4e6ce01..47b3647 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -213,7 +213,7 @@
     }
   }
 #if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && xd->cfl.store_y && is_cfl_allowed(mbmi)) {
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y && is_cfl_allowed(xd)) {
     cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
   }
 #endif  // CONFIG_CFL
@@ -584,7 +584,7 @@
 #if CONFIG_CFL
   if (mbmi->uv_mode != UV_CFL_PRED) {
     if (!cfl->is_chroma_reference && is_inter_block(mbmi) &&
-        is_cfl_allowed(mbmi)) {
+        is_cfl_allowed(xd)) {
       cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
     }
   }
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 0928fc0..c1f60f0 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -1200,7 +1200,7 @@
 
 #if CONFIG_CFL
     if (mbmi->uv_mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(mbmi)) {
+      if (!is_cfl_allowed(xd)) {
         aom_internal_error(
             &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
             "Chroma from Luma (CfL) cannot be signaled for a %dx%d block.",
@@ -1560,7 +1560,7 @@
 
 #if CONFIG_CFL
     if (mbmi->uv_mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(mbmi)) {
+      if (!is_cfl_allowed(xd)) {
         aom_internal_error(
             &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
             "Chroma from Luma (CfL) cannot be signaled for a %dx%d block.",
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 1c04554..75abce6 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1424,7 +1424,7 @@
 
 #if CONFIG_CFL
       if (mbmi->uv_mode == UV_CFL_PRED) {
-        if (!is_cfl_allowed(mbmi)) {
+        if (!is_cfl_allowed(xd)) {
           aom_internal_error(
               &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
               "Chroma from Luma (CfL) cannot be signaled for a %dx%d block.",
@@ -1735,7 +1735,7 @@
 
 #if CONFIG_CFL
     if (mbmi->uv_mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(mbmi)) {
+      if (!is_cfl_allowed(xd)) {
         aom_internal_error(
             &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
             "Chroma from Luma (CfL) cannot be signaled for a %dx%d block.",
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f1d516e..c0c7f31 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4827,7 +4827,7 @@
   if (is_inter_block(mbmi) &&
       !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
                            cfl->subsampling_y) &&
-      is_cfl_allowed(mbmi)) {
+      is_cfl_allowed(xd)) {
     cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
   }
 #endif  // CONFIG_CFL
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 65bc6be..354dca8 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -972,8 +972,7 @@
   if (*eob) *(args->skip) = 0;
 
 #if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && xd->cfl.store_y &&
-      is_cfl_allowed(&xd->mi[0]->mbmi)) {
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y && is_cfl_allowed(xd)) {
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
 #endif  // CONFIG_CFL
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index f4dad69..2917271 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2142,7 +2142,7 @@
     return;
   }
 #if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && xd->cfl.store_y && is_cfl_allowed(mbmi)) {
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y && is_cfl_allowed(xd)) {
     assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
     cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
@@ -5427,12 +5427,19 @@
 }
 
 static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size,
-                             int64_t best_rd) {
+                             TX_SIZE tx_size, int64_t best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
-                             xd->plane[AOM_PLANE_U].subsampling_y);
+
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_DEBUG
+  assert(is_cfl_allowed(xd));
+  const BLOCK_SIZE plane_bsize = AOMMAX(
+      BLOCK_4X4, get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]));
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+  assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+  assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+#endif
 
   int rates[CFL_PRED_PLANES][CFL_MAGS_SIZE];
   int64_t dists[CFL_PRED_PLANES][CFL_MAGS_SIZE];
@@ -5546,10 +5553,11 @@
 #if CONFIG_CFL
     int cfl_alpha_rate = 0;
     if (mode == UV_CFL_PRED) {
-      if (!is_cfl_allowed(mbmi)) continue;
+      if (!is_cfl_allowed(xd)) continue;
       assert(!is_directional_mode);
-      const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
-      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, bsize, uv_tx_size, best_rd);
+      const TX_SIZE uv_tx_size =
+          av1_get_uv_tx_size(mbmi, &xd->plane[AOM_PLANE_U]);
+      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
       if (cfl_alpha_rate == INT_MAX) continue;
     }
 #endif
@@ -5578,7 +5586,7 @@
 
 #if CONFIG_CFL
     if (mode == UV_CFL_PRED) {
-      assert(is_cfl_allowed(mbmi));
+      assert(is_cfl_allowed(xd));
       this_rate += cfl_alpha_rate;
 #if CONFIG_DEBUG
       assert(xd->cfl.rate == this_rate);