[CFL] allow for 4:1 rects if full tx available

Disable the CfL sub8x8 debug validation when CONFIG_RECT_TX_EXT_INTRA is
enabled, as it appears to give false negatives for 4:1 blocks. All other
tests pass.

The coding gain on subset1 is quite significant.

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1270 | -1.1386 | -1.1426 |  -0.1167 | -0.1157 | -0.1264 |    -0.4142

Change-Id: Ic20c9b1a5ff28e0fbd4e6491ed2cd2d1f6b487c9
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 0cd4ec4..d63d8dd 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -541,14 +541,14 @@
 #endif  // CONFIG_DEBUG
 
   int is_chroma_reference;
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
   // Validation buffer is usually 2x2, except for 16x4 and 4x16 in that case it
   // is 4x2 and 2x4 respectively. To simplify accessing the buffer we use a
   // stride of CFL_SUB8X8_VAL_MI_SIZE resulting in a square of 16.
   uint16_t sub8x8_val[CFL_SUB8X8_VAL_MI_SQUARE];
   uint16_t store_counter;
   uint16_t last_compute_counter;
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 } CFL_CTX;
 #endif  // CONFIG_CFL
 
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index d0e69d5..6ab5ad9 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -36,11 +36,11 @@
   cfl->use_dc_pred_cache = 0;
   cfl->dc_pred_is_cached[CFL_PRED_U] = 0;
   cfl->dc_pred_is_cached[CFL_PRED_V] = 0;
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
   cfl_clear_sub8x8_val(cfl);
   cfl->store_counter = 0;
   cfl->last_compute_counter = 0;
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 }
 
 void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
@@ -205,7 +205,7 @@
   // Do not call cfl_compute_parameters multiple time on the same values.
   assert(cfl->are_parameters_computed == 0);
 
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     const uint16_t compute_counter = cfl->sub8x8_val[0];
@@ -225,7 +225,7 @@
     }
     cfl->last_compute_counter = compute_counter;
   }
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 
   cfl_subtract_average(cfl, tx_size);
   cfl->are_parameters_computed = 1;
@@ -453,7 +453,7 @@
     (*col_out)++;
   }
 }
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 // Since the chroma surface of sub8x8 block span across multiple luma blocks,
 // this function validates that the reconstructed luma area required to predict
 // the chroma block using CfL has been stored during the previous luma encode.
@@ -510,7 +510,7 @@
     assert(found);
   }
 }
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 
 void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
                   BLOCK_SIZE bsize) {
@@ -525,9 +525,9 @@
     assert(!((col & 1) && tx_size_wide[tx_size] != 4));
     assert(!((row & 1) && tx_size_high[tx_size] != 4));
     sub8x8_adjust_offset(cfl, &row, &col);
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
     sub8x8_set_val(cfl, row, col, tx_size);
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
   }
   cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size_wide[tx_size],
             tx_size_high[tx_size], get_bitdepth_data_path_index(xd));
@@ -542,14 +542,14 @@
   assert(is_cfl_allowed(&xd->mi[0]->mbmi));
   if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
     sub8x8_adjust_offset(cfl, &row, &col);
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
     // Point to the last transform block inside the partition.
     const int off_row =
         row + (mi_size_high[bsize] - tx_size_high_unit[tx_size]);
     const int off_col =
         col + (mi_size_wide[bsize] - tx_size_wide_unit[tx_size]);
     sub8x8_set_val(cfl, off_row, off_col, tx_size);
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
   }
   const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index fb0e884..6d5d9cb 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -28,7 +28,12 @@
 static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MB_MODE_INFO *mbmi) {
   const BLOCK_SIZE bsize = mbmi->sb_type;
   assert(bsize < BLOCK_SIZES_ALL);
+#if CONFIG_EXT_PARTITION_TYPES && CONFIG_RECT_TX_EXT_INTRA
+  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
+                            block_size_high[bsize] <= 32);
+#else
   return (CFL_ALLOWED_TYPE)(bsize <= CFL_MAX_BLOCK_SIZE);
+#endif  // CONFIG_EXT_PARTITION_TYPES && CONFIG_RECT_TX_EXT_INTRA
 }
 
 static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 88201a5..10e0dcb 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -745,11 +745,11 @@
 }
 
 #if CONFIG_CFL
-#if CONFIG_DEBUG
+#if CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 static INLINE void cfl_clear_sub8x8_val(CFL_CTX *cfl) {
   memset(cfl->sub8x8_val, 0, sizeof(cfl->sub8x8_val));
 }
-#endif  // CONFIG_DEBUG
+#endif  // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
 void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
 #endif  // CONFIG_CFL