[CFL] DC_PRED on Partition Unit Neighboring Pixels

To avoid a cascade of encodes when performing CfL RDO,
we compute DC_PRED from the neighboring pixels of the
partition unit. To do so, we change the tx_size of CfL
to match the size of the partition unit (i.e., a CfL
partition contains exactly one transform block).

This change requires disabling CfL when a chroma
partition-unit-sized DC_PRED is unavailable (i.e., for
4:1 and 1:4 partitions, and for chroma partitions larger
than 32x32).

Results on Subset1 (compared to disabling 4:1 and 1:4
partition units):
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1243 | -1.9286 | -2.0140 |  -0.1514 | -0.1512 | -0.1947 |    -0.8066

https://two.arewecompressedyet.com/?job=master%402017-12-12T14%3A53%3A01.451Z&job=CfL-PartU%402017-12-12T15%3A39%3A36.794Z

Change-Id: I2a4adde79c10089130775b8e0df5f9c198855cad
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index 95c61d9..29a7d0d 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -186,27 +186,27 @@
 }
 
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
-                       int row, int col, TX_SIZE tx_size, int plane) {
+                       TX_SIZE tx_size, int plane) {
   CFL_CTX *const cfl = &xd->cfl;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  assert(is_cfl_allowed(xd));
 
   if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);
 
-  const int16_t *pred_buf_q3 =
-      cfl->pred_buf_q3 + ((row * MAX_SB_SIZE + col) << tx_size_wide_log2[0]);
   const int alpha_q3 =
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
 #if CONFIG_HIGHBITDEPTH
   if (get_bitdepth_data_path_index(xd)) {
     uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
-    cfl_build_prediction_hbd(pred_buf_q3, dst_16, dst_stride,
+    cfl_build_prediction_hbd(cfl->pred_buf_q3, dst_16, dst_stride,
                              tx_size_wide[tx_size], tx_size_high[tx_size],
                              alpha_q3, xd->bd);
     return;
   }
 #endif  // CONFIG_HIGHBITDEPTH
-  cfl_build_prediction_lbd(pred_buf_q3, dst, dst_stride, tx_size_wide[tx_size],
-                           tx_size_high[tx_size], alpha_q3);
+  cfl_build_prediction_lbd(cfl->pred_buf_q3, dst, dst_stride,
+                           tx_size_wide[tx_size], tx_size_high[tx_size],
+                           alpha_q3);
 }
 
 static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
@@ -466,7 +466,7 @@
   uint8_t *dst =
       &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
 
-  assert(is_cfl_allowed(&xd->mi[0]->mbmi));
+  assert(is_cfl_allowed(xd));
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     // Only dimensions of size 4 can have an odd offset.
     assert(!((col & 1) && tx_size_wide[tx_size] != 4));
@@ -487,7 +487,7 @@
   int col = 0;
   bsize = AOMMAX(BLOCK_4X4, bsize);
 
-  assert(is_cfl_allowed(&xd->mi[0]->mbmi));
+  assert(is_cfl_allowed(xd));
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     sub8x8_adjust_offset(cfl, &row, &col);
 #if CONFIG_DEBUG