CWG-D029: CfL improvements

This branch includes the source code of CWG-D029, which covers two aspects: simplifying the scaling factor, and replacing sample copying with a collocated filter for downsampling (DS) in CfL mode.

STATS_CHANGED
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index fb2b5af..8b43bc3 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -224,7 +224,13 @@
                               input[i + 1] + input[bot + AOMMAX(-1, -i)] +
                               2 * input[bot] + input[bot + 1];
         } else if (filter_type == 2) {
+#if CONFIG_CFL_IMPROVEMENTS
+          const int top = i - input_stride;
+          output_q3[i >> 1] = input[AOMMAX(0, i - 1)] + 4 * input[i] +
+                              input[i + 1] + input[top] + input[bot];
+#else
           output_q3[i >> 1] = input[i] * 8;
+#endif  // CONFIG_CFL_IMPROVEMENTS
         } else {
           output_q3[i >> 1] =
               (input[i] + input[i + 1] + input[bot] + input[bot + 1] + 2) << 1;
@@ -278,7 +284,13 @@
           output_q3[j >> 1] = input[-1] + 2 * input[0] + input[1] +
                               input[bot - 1] + 2 * input[bot] + input[bot + 1];
         } else if (filter_type == 2) {
+#if CONFIG_CFL_IMPROVEMENTS
+          const int top = (j == 0) ? 0 : (0 - input_stride);
+          output_q3[j >> 1] =
+              input[-1] + 4 * input[0] + input[1] + input[top] + input[bot];
+#else
           output_q3[j >> 1] = input[0] * 8;
+#endif  // CONFIG_CFL_IMPROVEMENTS
         } else {
           output_q3[j >> 1] =
               (input[0] + input[1] + input[bot] + input[bot + 1]) << 1;
@@ -601,7 +613,14 @@
                                             int height) {
   for (int j = 0; j < height; j += 2) {
     for (int i = 0; i < width; i += 2) {
+#if CONFIG_CFL_IMPROVEMENTS
+      const int top = (j == 0) ? i : (i - input_stride);
+      const int bot = i + input_stride;
+      output_q3[i >> 1] = input[AOMMAX(0, i - 1)] + 4 * input[i] +
+                          input[i + 1] + input[top] + input[bot];
+#else
       output_q3[i >> 1] = input[i] * 8;
+#endif  // CONFIG_CFL_IMPROVEMENTS
     }
     input += input_stride << 1;
     output_q3 += CFL_BUF_LINE;
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index b0a057e..bd04055 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -1466,7 +1466,17 @@
 static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = {
   AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294)
 };
-
+#if CONFIG_CFL_IMPROVEMENTS
+static const aom_cdf_prob
+    default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
+      { AOM_CDF8(7650, 20740, 31430, 32520, 32700, 32730, 32740) },
+      { AOM_CDF8(14400, 23680, 28230, 31270, 32290, 32530, 32640) },
+      { AOM_CDF8(11560, 22430, 28510, 31430, 32430, 32610, 32680) },
+      { AOM_CDF8(27000, 31430, 32310, 32610, 32730, 32740, 32750) },
+      { AOM_CDF8(17320, 26210, 29100, 30820, 31550, 32150, 32430) },
+      { AOM_CDF8(14990, 22180, 26430, 28600, 29820, 31200, 31980) }
+    };
+#else
 static const aom_cdf_prob
     default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
       { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700,
@@ -1482,7 +1492,7 @@
       { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144,
                   32413, 32520, 32594, 32622, 32656, 32660) }
     };
-
+#endif  // CONFIG_CFL_IMPROVEMENTS
 static const aom_cdf_prob
     default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
         SWITCHABLE_FILTERS)] = {
diff --git a/av1/common/enums.h b/av1/common/enums.h
index ea2a2e4..c5493ff 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -651,7 +651,11 @@
 
 enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
 
+#if CONFIG_CFL_IMPROVEMENTS
+#define CFL_ALPHABET_SIZE_LOG2 3
+#else
 #define CFL_ALPHABET_SIZE_LOG2 4
+#endif  // CONFIG_CFL_IMPROVEMENTS
 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
 #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
 #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 276b474..bf932d3 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1680,9 +1680,13 @@
     dst += CFL_BUF_LINE;
   }
 }
-
+#if CONFIG_CFL_IMPROVEMENTS
+static int64_t compute_sad(const uint16_t *src, uint16_t *src2, int width,
+                           int height, int round_offset, int src2_stride) {
+#else
 static int compute_sad(const uint16_t *src, uint16_t *src2, int width,
                        int height, int round_offset, int src2_stride) {
+#endif  // CONFIG_CFL_IMPROVEMENTS
   int sad = round_offset;
   for (int j = 0; j < height; ++j) {
     for (int i = 0; i < width; ++i) {
@@ -1691,7 +1695,11 @@
     src += CFL_BUF_LINE;
     src2 += src2_stride;
   }
+#if CONFIG_CFL_IMPROVEMENTS
+  return sad;
+#else
   return (sad / (height * width));
+#endif  // CONFIG_CFL_IMPROVEMENTS
 }
 
 static void cfl_predict_hbd_pre_analysis(const int16_t *ac_buf_q3,
@@ -1761,14 +1769,23 @@
   const int subsampling_x = cpi->unfiltered_source->subsampling_x;
   const int subsampling_y = cpi->unfiltered_source->subsampling_y;
 
+#if CONFIG_CFL_IMPROVEMENTS
+  const int blk_w = 16;
+  const int blk_h = 16;
+#else
   const int blk_w = 32;
   const int blk_h = 32;
+#endif  // CONFIG_CFL_IMPROVEMENTS
 
   uint16_t recon_buf_q3[CFL_BUF_SQUARE];
   uint16_t dc_buf_q3[CFL_BUF_SQUARE];
   // Q3 AC contributions (reconstructed luma pixels - tx block avg)
   int16_t ac_buf_q3[CFL_BUF_SQUARE];
+#if CONFIG_CFL_IMPROVEMENTS
+  int64_t cost[3] = { 0, 0, 0 };
+#else
   int cost[3] = { 0, 0, 0 };
+#endif  // CONFIG_CFL_IMPROVEMENTS
   for (int filter_type = 0; filter_type < 3; ++filter_type) {
     for (int comp = 0; comp < 2; comp++) {
       for (int r = 2; r + blk_h <= height - 2; r += blk_h) {
@@ -1803,15 +1820,24 @@
                              chroma_stride, blk_w >> 1, blk_h >> 1);
           cfl_predict_hbd_pre_analysis(ac_buf_q3, dc_buf_q3, CFL_BUF_LINE,
                                        alpha, bd, blk_w >> 1, blk_h >> 1);
+#if CONFIG_CFL_IMPROVEMENTS
+          int64_t filter_cost =
+              compute_sad(dc_buf_q3, this_src_chroma, blk_w >> 1, blk_h >> 1, 2,
+                          chroma_stride);
+#else
           int filter_cost = compute_sad(dc_buf_q3, this_src_chroma, blk_w >> 1,
                                         blk_h >> 1, 2, chroma_stride);
+#endif  // CONFIG_CFL_IMPROVEMENTS
           cost[filter_type] = cost[filter_type] + filter_cost;
         }
       }
     }
   }
-
+#if CONFIG_CFL_IMPROVEMENTS
+  int64_t min_cost = INT64_MAX;
+#else
   int min_cost = INT_MAX;
+#endif  // CONFIG_CFL_IMPROVEMENTS
   for (int i = 0; i < 3; ++i) {
     if (cost[i] < min_cost) {
       min_cost = cost[i];
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 588ed2c..9fd4d2c 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -221,6 +221,8 @@
 set_aom_config_var(CONFIG_BVCOST_UPDATE 1 "Enables sb-level update for bv cost")
 set_aom_config_var(CONFIG_CCSO_EXT 1
                    "AV2 experiment flag to enable extended CCSO.")
+set_aom_config_var(CONFIG_CFL_IMPROVEMENTS 1
+                   "AV2 Cfl improvements from CWG-D029.")
 set_aom_config_var(CONFIG_ADAPTIVE_MVD 1 "Enable adaptive MVD resolution")
 set_aom_config_var(CONFIG_JOINT_MVD 1 "Enable joint MVD coding")
 set_aom_config_var(CONFIG_IMPROVED_JMVD 1