Fix multiple issues in D118 (MHCCP)

* Fix arguments to cfl_predict_block (pass left_lines, not above_lines twice). Fixes #320
* Fix shift direction for dynamic range scaling (Ty is now right-shifted like ATA). Fixes #321
* Implement YUV422 and YUV444 support. Fixes #318 and #324
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index c2ce88b..853ee1d 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -44,13 +44,15 @@
   assert(pred_plane < CFL_PRED_PLANES);
   assert(width <= CFL_BUF_LINE);
 
-  memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width << 1);
+  memcpy(xd->cfl.dc_pred_cache[pred_plane], input,
+         width << xd->cfl.subsampling_x);
   return;
 }
 
 static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,
-                                 int dst_stride, int width, int height) {
-  const size_t num_bytes = width << 1;
+                                 int dst_stride, int width, int height,
+                                 int sub_x) {
+  const size_t num_bytes = width << sub_x;
   for (int j = 0; j < height; j++) {
     memcpy(dst, dc_pred_cache, num_bytes);
     dst += dst_stride;
@@ -64,7 +66,7 @@
   assert(width <= CFL_BUF_LINE);
   assert(height <= CFL_BUF_LINE);
   cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride,
-                       width, height);
+                       width, height, xd->cfl.subsampling_x);
 }
 
 // Due to frame boundary issues, it is possible that the total area covered by
@@ -956,10 +958,14 @@
         A[0][count] = (l[i + j * ref_stride] >> 3);  // C
         if (dir == 0) {
           A[1][count] = (l[i + (j - 1) * ref_stride] >> 3);  // N 1, -1
-          A[2][count] = (l[i + (j + 1) * ref_stride] >> 3);  // S 1,  1
+          A[2][count] = (i >= left_lines && j + 1 >= above_lines)
+                            ? (l[i + (j)*ref_stride] >> 3)
+                            : (l[i + (j + 1) * ref_stride] >> 3);  // S 1,  1
         } else {
           A[1][count] = (l[(i - 1) + j * ref_stride] >> 3);  // W 1, -1
-          A[2][count] = (l[(i + 1) + j * ref_stride] >> 3);  // E 1,  1
+          A[2][count] = (i + 1 >= left_lines && j >= above_lines)
+                            ? (l[(i) + j * ref_stride] >> 3)
+                            : (l[(i + 1) + j * ref_stride] >> 3);  // E 1,  1
         }
         A[3][count] = NON_LINEAR((l[i + j * ref_stride] >> 3), mid, xd->bd);
         A[4][count] = mid;
@@ -1012,7 +1018,7 @@
           ATA[coli0][coli1] >>= matrixShift;
 
       for (int coli = 0; coli < MHCCP_NUM_PARAMS; coli++)
-        Ty[coli] <<= matrixShift;
+        Ty[coli] >>= matrixShift;
     }
     int64_t U[MHCCP_NUM_PARAMS][MHCCP_NUM_PARAMS];
     int64_t diag[MHCCP_NUM_PARAMS];
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index a26b039..b6dba3d 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -2143,32 +2143,43 @@
     }
 #if CONFIG_ADPTIVE_DS_422
     else if (sub_x) {
-      input = dst - input_stride;
-      for (int i = 0; i < width; i += 2) {
+      for (int h = 0; h < (*ref_height); h++) {
+        for (int i = 0; i < (*ref_width); i += 2) {
 #if CONFIG_ADAPTIVE_DS_FILTER
-        const int filter_type = cm->seq_params.enable_cfl_ds_filter;
-        if (filter_type == 1) {
-          output_q3[i >> 1] =
-              (input[AOMMAX(0, i - 1)] + 2 * input[i] + input[i + 1]) << 1;
-        } else if (filter_type == 2) {
-          output_q3[i >> 1] = input[i] << 3;
-        } else {
-          output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
-        }
+          const int filter_type = cm->seq_params.enable_cfl_ds_filter;
+          if (filter_type == 1) {
+            output_q3[i >> 1] =
+                (input[AOMMAX(0, i - 1)] + 2 * input[i] + input[i + 1]) << 1;
+          } else if (filter_type == 2) {
+            output_q3[i >> 1] = input[i] << 3;
+          } else {
+            output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
+          }
 #else
-        output_q3[i >> 1] = input[i] << 3;
+          output_q3[i >> 1] = input[i] << 3;
 #endif  // CONFIG_ADAPTIVE_DS_FILTER
+        }
+        output_q3 += output_stride;
+        input += input_stride;
       }
-#endif                   // CONFIG_ADPTIVE_DS_422
-    } else if (sub_y) {  // @todo extend for 422
-      input = dst - 2 * input_stride;
-      for (int i = 0; i < width; ++i) {
-        const int bot = i + input_stride;
-        output_q3[i] = (input[i] + input[bot]) << 2;
+#endif
+    } else if (sub_y) {
+      for (int h = 0; h < (*ref_height); h += 2) {
+        for (int i = 0; i < (*ref_width); ++i) {
+          const int bot = i + input_stride;
+          output_q3[i] = (input[i] + input[bot]) << 2;
+        }
+        output_q3 += output_stride;
+        input += input_stride * 2;
       }
-    } else {  // @todo extend for 444
-      input = dst - input_stride;
-      for (int i = 0; i < width; ++i) output_q3[i] = input[i] << 3;
+    } else {
+      for (int h = 0; h < (*ref_height); h++) {
+        for (int i = 0; i < (*ref_width); ++i) {
+          output_q3[i] = input[i] << 3;
+        }
+        output_q3 += output_stride;
+        input += input_stride;
+      }
     }
   }
 }
@@ -2271,7 +2282,14 @@
     }
 #if CONFIG_IMPROVED_CFL
     CFL_CTX *const cfl = &xd->cfl;
+    const int sub_x = cfl->subsampling_x;
+    const int sub_y = cfl->subsampling_y;
+
     CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
+    if (mbmi->cfl_idx == CFL_DERIVED_ALPHA) {
+      cfl->dc_pred_is_cached[pred_plane] = 0;
+      cfl->use_dc_pred_cache = 0;
+    }
     if (cfl->dc_pred_is_cached[pred_plane] == 0) {
       av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
                               angle_delta, use_palette, filter_intra_mode, dst,
@@ -2306,10 +2324,11 @@
             cm, xd, blk_row << cfl->subsampling_y,
             blk_col << cfl->subsampling_x, tx_size, &above_lines, &left_lines,
             &ref_width, &ref_height);
-        above_lines >>= 1;
-        left_lines >>= 1;
-        ref_width >>= 1;
-        ref_height >>= 1;
+
+        above_lines >>= sub_y;
+        left_lines >>= sub_x;
+        ref_width >>= sub_x;
+        ref_height >>= sub_y;
         mhccp_implicit_fetch_neighbor_chroma(xd, plane, blk_row, blk_col,
                                              tx_size, above_lines, left_lines,
                                              ref_width, ref_height);
@@ -2320,10 +2339,10 @@
             cm, xd, blk_row << cfl->subsampling_y,
             blk_col << cfl->subsampling_x, tx_size, &above_lines, &left_lines,
             &ref_width, &ref_height);
-        above_lines >>= 1;
-        left_lines >>= 1;
-        ref_width >>= 1;
-        ref_height >>= 1;
+        above_lines >>= sub_y;
+        left_lines >>= sub_x;
+        ref_width >>= sub_x;
+        ref_height >>= sub_y;
         mhccp_implicit_fetch_neighbor_chroma(xd, plane, blk_row, blk_col,
                                              tx_size, above_lines, left_lines,
                                              ref_width, ref_height);
@@ -2336,7 +2355,7 @@
     }
 #endif
     cfl_predict_block(xd, dst, dst_stride, tx_size, plane, above_lines > 0,
-                      left_lines > 0, above_lines, above_lines);
+                      left_lines > 0, above_lines, left_lines);
     return;
   }