Fix CCSO filter unit size (guarded by CONFIG_CCSO_FU_BUGFIX)
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index 22c233d..69d1325 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -2209,12 +2209,21 @@
       if (buf->ccso_info.sb_filter_control[pli]) {
         aom_free(buf->ccso_info.sb_filter_control[pli]);
       }
+#if CONFIG_CCSO_FU_BUGFIX
+      const int log2_filter_unit_size_y =
+          pli == 0 ? CCSO_BLK_SIZE
+                  : CCSO_BLK_SIZE - cm->seq_params.subsampling_y;
+      const int log2_filter_unit_size_x =
+          pli == 0 ? CCSO_BLK_SIZE
+                  : CCSO_BLK_SIZE - cm->seq_params.subsampling_x;
+#else
       const int log2_filter_unit_size_y =
           pli > 0 ? CCSO_BLK_SIZE
                   : CCSO_BLK_SIZE + cm->seq_params.subsampling_y;
       const int log2_filter_unit_size_x =
           pli > 0 ? CCSO_BLK_SIZE
                   : CCSO_BLK_SIZE + cm->seq_params.subsampling_x;
+#endif
 
       const int ccso_nvfb =
           ((cm->mi_params.mi_rows >> (pli ? cm->seq_params.subsampling_y : 0)) +
@@ -2231,6 +2240,9 @@
               32, sizeof(*buf->ccso_info.sb_filter_control[pli]) * sb_count));
       memset(buf->ccso_info.sb_filter_control[pli], 0,
              sizeof(*buf->ccso_info.sb_filter_control[pli]) * sb_count);
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: plane %d nvfb %d nhfb %d sb_count %d @ %s\n", pli, ccso_nvfb, ccso_nhfb, sb_count, __FUNCTION__);
+#endif
     }
   }
 #endif  // CONFIG_CCSO_IMPROVE
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 1c9a2b4..1e089f7 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -620,31 +620,63 @@
 }
 
 # Cross-component Sample Offset
-add_proto qw/void ccso_filter_block_hbd_wo_buf/, "const uint16_t *src_y, uint16_t *dst_yuv, const int x, const int y, const int pic_width, const int pic_height, int *src_cls, const int8_t *offset_buf, const int scaled_ext_stride, const int dst_stride, const int y_uv_hscale, const int y_uv_vscale, const int thr, const int neg_thr, const int *src_loc, const int max_val, const int blk_size, const bool isSingleBand, const uint8_t shift_bits, const int edge_clf, const uint8_t ccso_bo_only";
+if (aom_config("CONFIG_CCSO_FU_BUGFIX") eq "yes") {
+  add_proto qw/void ccso_filter_block_hbd_wo_buf/, "const uint16_t *src_y, uint16_t *dst_yuv, const int x, const int y, const int pic_width, const int pic_height, int *src_cls, const int8_t *offset_buf, const int scaled_ext_stride, const int dst_stride, const int y_uv_hscale, const int y_uv_vscale, const int thr, const int neg_thr, const int *src_loc, const int max_val, const int blk_size_x, const int blk_size_y, const bool isSingleBand, const uint8_t shift_bits, const int edge_clf, const uint8_t ccso_bo_only";
+}
+else {
+  add_proto qw/void ccso_filter_block_hbd_wo_buf/, "const uint16_t *src_y, uint16_t *dst_yuv, const int x, const int y, const int pic_width, const int pic_height, int *src_cls, const int8_t *offset_buf, const int scaled_ext_stride, const int dst_stride, const int y_uv_hscale, const int y_uv_vscale, const int thr, const int neg_thr, const int *src_loc, const int max_val, const int blk_size, const bool isSingleBand, const uint8_t shift_bits, const int edge_clf, const uint8_t ccso_bo_only";
+}
 specialize qw/ccso_filter_block_hbd_wo_buf avx2/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-  add_proto qw/void ccso_filter_block_hbd_with_buf/, "const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0, const uint8_t *src_cls1,
-  const int src_y_stride, const int dst_stride,
-  const int ccso_stride,
-  const int x, const int y,
-  const int pic_width, const int pic_height,
-  const int8_t *filter_offset, const int blk_size,
-  const int y_uv_hscale,  const int y_uv_vscale,
-  const int max_val, const uint8_t shift_bits,
-  const uint8_t ccso_bo_only";
-  specialize qw/ccso_filter_block_hbd_with_buf avx2/;
+  if (aom_config("CONFIG_CCSO_FU_BUGFIX") eq "yes") {
+    add_proto qw/void ccso_filter_block_hbd_with_buf/, "const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0, const uint8_t *src_cls1,
+                      const int src_y_stride, const int dst_stride,
+                      const int ccso_stride,
+                      const int x, const int y,
+                      const int pic_width, const int pic_height,
+                      const int8_t *filter_offset, const int blk_size_x,
+                      const int blk_size_y,
+                      const int y_uv_hscale,  const int y_uv_vscale,
+                      const int max_val, const uint8_t shift_bits,
+                      const uint8_t ccso_bo_only";
+    specialize qw/ccso_filter_block_hbd_with_buf avx2/;
 
-  add_proto qw/void ccso_filter_block_hbd_with_buf_bo_only/, "const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0, const uint8_t *src_cls1,
-  const int src_y_stride, const int dst_stride,
-  const int ccso_stride,
-  const int x, const int y,
-  const int pic_width, const int pic_height,
-  const int8_t *filter_offset, const int blk_size,
-  const int y_uv_hscale,  const int y_uv_vscale,
-  const int max_val, const uint8_t shift_bits,
-  const uint8_t ccso_bo_only";
-  specialize qw/ccso_filter_block_hbd_with_buf_bo_only avx2/;
+    add_proto qw/void ccso_filter_block_hbd_with_buf_bo_only/, "const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0, const uint8_t *src_cls1,
+                      const int src_y_stride, const int dst_stride,
+                      const int ccso_stride,
+                      const int x, const int y,
+                      const int pic_width, const int pic_height,
+                      const int8_t *filter_offset, const int blk_size_x,
+                      const int blk_size_y,
+                      const int y_uv_hscale,  const int y_uv_vscale,
+                      const int max_val, const uint8_t shift_bits,
+                      const uint8_t ccso_bo_only";
+    specialize qw/ccso_filter_block_hbd_with_buf_bo_only avx2/;
+  }
+  else {
+    add_proto qw/void ccso_filter_block_hbd_with_buf/, "const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0, const uint8_t *src_cls1,
+                      const int src_y_stride, const int dst_stride,
+                      const int ccso_stride,
+                      const int x, const int y,
+                      const int pic_width, const int pic_height,
+                      const int8_t *filter_offset, const int blk_size,
+                      const int y_uv_hscale,  const int y_uv_vscale,
+                      const int max_val, const uint8_t shift_bits,
+                      const uint8_t ccso_bo_only";
+    specialize qw/ccso_filter_block_hbd_with_buf avx2/;
+
+    add_proto qw/void ccso_filter_block_hbd_with_buf_bo_only/, "const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0, const uint8_t *src_cls1,
+                      const int src_y_stride, const int dst_stride,
+                      const int ccso_stride,
+                      const int x, const int y,
+                      const int pic_width, const int pic_height,
+                      const int8_t *filter_offset, const int blk_size,
+                      const int y_uv_hscale,  const int y_uv_vscale,
+                      const int max_val, const uint8_t shift_bits,
+                      const uint8_t ccso_bo_only";
+    specialize qw/ccso_filter_block_hbd_with_buf_bo_only avx2/;
+  }
 
   add_proto qw/uint64_t compute_distortion_block/, "const uint16_t *org, const int org_stride,
                       const uint16_t *rec16, const int rec_stride, const int x, const int y,
@@ -652,12 +684,23 @@
                       const int width";
   specialize qw/compute_distortion_block avx2/;
 
-  add_proto qw/void ccso_derive_src_block/, "const uint16_t *src_y, uint8_t *const src_cls0,
-                        uint8_t *const src_cls1, const int src_y_stride, const int ccso_stride,
-                        const int x, const int y, const int pic_width, const int pic_height,
-                        const int y_uv_hscale, const int y_uv_vscale, const int qstep,
-                        const int neg_qstep, const int *src_loc, const int blk_size, const int edge_clf";
-  specialize qw/ccso_derive_src_block avx2/
+  if (aom_config("CONFIG_CCSO_FU_BUGFIX") eq "yes") {
+    add_proto qw/void ccso_derive_src_block/, "const uint16_t *src_y, uint8_t *const src_cls0,
+                          uint8_t *const src_cls1, const int src_y_stride, const int ccso_stride,
+                          const int x, const int y, const int pic_width, const int pic_height,
+                          const int y_uv_hscale, const int y_uv_vscale, const int qstep,
+                          const int neg_qstep, const int *src_loc, const int blk_size_x,
+                          const int blk_size_y, const int edge_clf";
+    specialize qw/ccso_derive_src_block avx2/
+  }
+  else {
+    add_proto qw/void ccso_derive_src_block/, "const uint16_t *src_y, uint8_t *const src_cls0,
+                          uint8_t *const src_cls1, const int src_y_stride, const int ccso_stride,
+                          const int x, const int y, const int pic_width, const int pic_height,
+                          const int y_uv_hscale, const int y_uv_vscale, const int qstep,
+                          const int neg_qstep, const int *src_loc, const int blk_size, const int edge_clf";
+    specialize qw/ccso_derive_src_block avx2/
+  }
 }
 
 # WARPED_MOTION / GLOBAL_MOTION functions
diff --git a/av1/common/ccso.c b/av1/common/ccso.c
index a5c148e..794d0e7 100644
--- a/av1/common/ccso.c
+++ b/av1/common/ccso.c
@@ -150,10 +150,20 @@
     const int8_t *offset_buf, const int src_y_stride, const int dst_stride,
     const int y_uv_hscale, const int y_uv_vscale, const int thr,
     const int neg_thr, const int *src_loc, const int max_val,
-    const int blk_size, const bool isSingleBand, const uint8_t shift_bits,
+#if CONFIG_CCSO_FU_BUGFIX
+    const int blk_size_x, const int blk_size_y,
+#else
+    const int blk_size,
+#endif
+    const bool isSingleBand, const uint8_t shift_bits,
     const int edge_clf, const uint8_t ccso_bo_only) {
+#if CONFIG_CCSO_FU_BUGFIX
+  const int y_end = AOMMIN(pic_height - y, blk_size_y);
+  const int x_end = AOMMIN(pic_width - x, blk_size_x);
+#else
   const int y_end = AOMMIN(pic_height - y, blk_size);
   const int x_end = AOMMIN(pic_width - x, blk_size);
+#endif
   for (int y_start = 0; y_start < y_end; y_start++) {
     const int y_pos = y_start;
     for (int x_start = 0; x_start < x_end; x_start++) {
@@ -202,34 +212,67 @@
   const int neg_thr = thr * -1;
   int src_loc[2];
   derive_ccso_sample_pos(cm, src_loc, ccso_ext_stride, filter_sup);
+#if CONFIG_CCSO_FU_BUGFIX
+  assert(plane == 0); // function must only be called for plane == 0
+  const int blk_log2 = CCSO_BLK_SIZE;
+#else
   const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+#endif
   const int blk_size = 1 << blk_log2;
   src_y += CCSO_PADDING_SIZE * ccso_ext_stride + CCSO_PADDING_SIZE;
+#if CONFIG_CCSO_DEBUG
+  printf("CCSO: plane %d bo_only %d thr %d neg_thr %d @ %s\n",
+         plane, cm->ccso_info.ccso_bo_only[plane],
+         thr, neg_thr, __FUNCTION__);
+#endif
   for (int y = 0; y < pic_height; y += blk_size) {
+#if CONFIG_CCSO_DEBUG
+    printf("CCSO: y %d", y);
+#endif
     for (int x = 0; x < pic_width; x += blk_size) {
+#if CONFIG_CCSO_FU_BUGFIX
+      const int ccso_blk_idx =
+          (blk_size >> MI_SIZE_LOG2) * (y >> blk_log2) * mi_params->mi_stride +
+          (blk_size >> MI_SIZE_LOG2) * (x >> blk_log2);
+#else
       const int ccso_blk_idx =
           (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
               (y >> blk_log2) * mi_params->mi_stride +
           (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
               (x >> blk_log2);
+#endif
       const bool use_ccso = mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_y;
+#if CONFIG_CCSO_DEBUG  // trace: mi-grid index, on/off flag, mode-info pointer
+      printf(" use [%d] %d (%p)", ccso_blk_idx, use_ccso, (const void *)mi_params->mi_grid_base[ccso_blk_idx]);  // %p requires a void pointer
+#endif
       if (!use_ccso) continue;
       if (cm->ccso_info.ccso_bo_only[plane]) {
         ccso_filter_block_hbd_wo_buf_c(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride, 0,
-            0, thr, neg_thr, src_loc, max_val, blk_size, false, shift_bits,
+            0, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size,
+#endif
+            blk_size, false, shift_bits,
             edge_clf, cm->ccso_info.ccso_bo_only[plane]);
       } else {
         ccso_filter_block_hbd_wo_buf(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride, 0,
-            0, thr, neg_thr, src_loc, max_val, blk_size, false, shift_bits,
+            0, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size,
+#endif
+            blk_size, false, shift_bits,
             edge_clf, 0);
       }
     }
     dst_yuv += (dst_stride << blk_log2);
     src_y += (ccso_ext_stride << blk_log2);
+#if CONFIG_CCSO_DEBUG
+    printf("\n");
+#endif
   }
 }
 
@@ -256,34 +299,67 @@
   const int neg_thr = thr * -1;
   int src_loc[2];
   derive_ccso_sample_pos(cm, src_loc, ccso_ext_stride, filter_sup);
+#if CONFIG_CCSO_FU_BUGFIX
+  assert(plane == 0); // function must only be called for plane == 0
+  const int blk_log2 = CCSO_BLK_SIZE;
+#else
   const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+#endif
   const int blk_size = 1 << blk_log2;
   src_y += CCSO_PADDING_SIZE * ccso_ext_stride + CCSO_PADDING_SIZE;
+#if CONFIG_CCSO_DEBUG
+  printf("CCSO: plane %d bo_only %d thr %d neg_thr %d @ %s\n",
+         plane, cm->ccso_info.ccso_bo_only[plane],
+         thr, neg_thr, __FUNCTION__);
+#endif
   for (int y = 0; y < pic_height; y += blk_size) {
+#if CONFIG_CCSO_DEBUG
+    printf("CCSO: y %d", y);
+#endif
     for (int x = 0; x < pic_width; x += blk_size) {
+#if CONFIG_CCSO_FU_BUGFIX
+      const int ccso_blk_idx =
+          (blk_size >> MI_SIZE_LOG2) * (y >> blk_log2) * mi_params->mi_stride +
+          (blk_size >> MI_SIZE_LOG2) * (x >> blk_log2);
+#else
       const int ccso_blk_idx =
           (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
               (y >> blk_log2) * mi_params->mi_stride +
           (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
               (x >> blk_log2);
+#endif
       const bool use_ccso = mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_y;
+#if CONFIG_CCSO_DEBUG  // trace: mi-grid index, on/off flag, mode-info pointer
+      printf(" use [%d] %d (%p)", ccso_blk_idx, use_ccso, (const void *)mi_params->mi_grid_base[ccso_blk_idx]);  // %p requires a void pointer
+#endif
       if (!use_ccso) continue;
       if (cm->ccso_info.ccso_bo_only[plane]) {
         ccso_filter_block_hbd_wo_buf_c(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride, 0,
-            0, thr, neg_thr, src_loc, max_val, blk_size, true, shift_bits,
+            0, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size,
+#endif
+            blk_size, true, shift_bits,
             edge_clf, cm->ccso_info.ccso_bo_only[plane]);
       } else {
         ccso_filter_block_hbd_wo_buf(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride, 0,
-            0, thr, neg_thr, src_loc, max_val, blk_size, true, shift_bits,
+            0, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size,
+#endif
+            blk_size, true, shift_bits,
             edge_clf, 0);
       }
     }
     dst_yuv += (dst_stride << blk_log2);
     src_y += (ccso_ext_stride << blk_log2);
+#if CONFIG_CCSO_DEBUG
+    printf("\n");
+#endif
   }
 }
 
@@ -311,36 +387,74 @@
   const int neg_thr = thr * -1;
   int src_loc[2];
   derive_ccso_sample_pos(cm, src_loc, ccso_ext_stride, filter_sup);
-  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
-  const int blk_size = 1 << blk_log2;
+#if CONFIG_CCSO_FU_BUGFIX
+  assert(plane > 0); // function must only be called for plane > 0
+  const int blk_size = 1 << CCSO_BLK_SIZE;  // un-subsampled unit size, used for the mi-grid index
+  const int blk_log2_y = CCSO_BLK_SIZE - cm->seq_params.subsampling_y;
+  const int blk_log2_x = CCSO_BLK_SIZE - cm->seq_params.subsampling_x;
+  const int blk_size_y = 1 << blk_log2_y;  // loop step in rows of this plane
+  const int blk_size_x = 1 << blk_log2_x;  // loop step in columns of this plane
+#else
+  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;  // still read by the legacy ccso_blk_idx below
+  const int blk_log2_y = blk_log2;  // legacy path: square unit, same shift in both directions
+  const int blk_size_y = 1 << blk_log2, blk_size_x = blk_size_y;  // was self-initialised from blk_size_x
+#endif
   src_y += CCSO_PADDING_SIZE * ccso_ext_stride + CCSO_PADDING_SIZE;
-  for (int y = 0; y < pic_height; y += blk_size) {
-    for (int x = 0; x < pic_width; x += blk_size) {
+#if CONFIG_CCSO_DEBUG
+  printf("CCSO: plane %d bo_only %d thr %d neg_thr %d @ %s\n",
+         plane, cm->ccso_info.ccso_bo_only[plane],
+         thr, neg_thr, __FUNCTION__);
+#endif
+  for (int y = 0; y < pic_height; y += blk_size_y) {
+#if CONFIG_CCSO_DEBUG
+    printf("CCSO: y %d", y);
+#endif
+    for (int x = 0; x < pic_width; x += blk_size_x) {
+#if CONFIG_CCSO_FU_BUGFIX
       const int ccso_blk_idx =
-          (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
+          (blk_size >> MI_SIZE_LOG2) * (y >> blk_log2_y) * mi_params->mi_stride +
+          (blk_size >> MI_SIZE_LOG2) * (x >> blk_log2_x);
+#else
+      const int ccso_blk_idx =
+          (blk_size_y >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
               (y >> blk_log2) * mi_params->mi_stride +
-          (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
+          (blk_size_x >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
               (x >> blk_log2);
+#endif
       const bool use_ccso =
           (plane == 1) ? mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_u
                        : mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_v;
+#if CONFIG_CCSO_DEBUG  // trace: mi-grid index, on/off flag, mode-info pointer
+      printf(" use [%d] %d (%p)", ccso_blk_idx, use_ccso, (const void *)mi_params->mi_grid_base[ccso_blk_idx]);  // %p requires a void pointer
+#endif
       if (!use_ccso) continue;
       if (cm->ccso_info.ccso_bo_only[plane]) {
         ccso_filter_block_hbd_wo_buf_c(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride,
-            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val, blk_size,
+            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size_x,
+#endif
+            blk_size_y,
             false, shift_bits, edge_clf, cm->ccso_info.ccso_bo_only[plane]);
       } else {
         ccso_filter_block_hbd_wo_buf(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride,
-            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val, blk_size,
+            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size_x,
+#endif
+            blk_size_y,
             false, shift_bits, edge_clf, 0);
       }
     }
-    dst_yuv += (dst_stride << blk_log2);
-    src_y += (ccso_ext_stride << (blk_log2 + y_uv_vscale));
+    dst_yuv += (dst_stride << blk_log2_y);
+    src_y += (ccso_ext_stride << (blk_log2_y + y_uv_vscale));
+#if CONFIG_CCSO_DEBUG
+    printf("\n");
+#endif
   }
 }
 
@@ -369,36 +483,74 @@
   const int neg_thr = thr * -1;
   int src_loc[2];
   derive_ccso_sample_pos(cm, src_loc, ccso_ext_stride, filter_sup);
-  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
-  const int blk_size = 1 << blk_log2;
+#if CONFIG_CCSO_FU_BUGFIX
+  assert(plane > 0); // function must only be called for plane > 0
+  const int blk_size = 1 << CCSO_BLK_SIZE;  // un-subsampled unit size, used for the mi-grid index
+  const int blk_log2_y = CCSO_BLK_SIZE - cm->seq_params.subsampling_y;
+  const int blk_log2_x = CCSO_BLK_SIZE - cm->seq_params.subsampling_x;
+  const int blk_size_y = 1 << blk_log2_y;  // loop step in rows of this plane
+  const int blk_size_x = 1 << blk_log2_x;  // loop step in columns of this plane
+#else
+  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;  // still read by the legacy ccso_blk_idx below
+  const int blk_log2_y = blk_log2;  // legacy path: square unit, same shift in both directions
+  const int blk_size_y = 1 << blk_log2, blk_size_x = blk_size_y;  // was self-initialised from blk_size_x
+#endif
   src_y += CCSO_PADDING_SIZE * ccso_ext_stride + CCSO_PADDING_SIZE;
-  for (int y = 0; y < pic_height; y += blk_size) {
-    for (int x = 0; x < pic_width; x += blk_size) {
+#if CONFIG_CCSO_DEBUG
+  printf("CCSO: plane %d bo_only %d thr %d neg_thr %d @ %s\n",
+         plane, cm->ccso_info.ccso_bo_only[plane],
+         thr, neg_thr, __FUNCTION__);
+#endif
+  for (int y = 0; y < pic_height; y += blk_size_y) {
+#if CONFIG_CCSO_DEBUG
+    printf("CCSO: y %d", y);
+#endif
+    for (int x = 0; x < pic_width; x += blk_size_x) {
+#if CONFIG_CCSO_FU_BUGFIX
       const int ccso_blk_idx =
-          (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
+          (blk_size >> MI_SIZE_LOG2) * (y >> blk_log2_y) * mi_params->mi_stride +
+          (blk_size >> MI_SIZE_LOG2) * (x >> blk_log2_x);
+#else
+      const int ccso_blk_idx =
+          (blk_size_y >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
               (y >> blk_log2) * mi_params->mi_stride +
-          (blk_size >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
+          (blk_size_x >> (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
               (x >> blk_log2);
+#endif
       const bool use_ccso =
           (plane == 1) ? mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_u
                        : mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_v;
+#if CONFIG_CCSO_DEBUG  // trace: mi-grid index, on/off flag, mode-info pointer
+      printf(" use [%d] %d (%p)", ccso_blk_idx, use_ccso, (const void *)mi_params->mi_grid_base[ccso_blk_idx]);  // %p requires a void pointer
+#endif
       if (!use_ccso) continue;
       if (cm->ccso_info.ccso_bo_only[plane]) {
         ccso_filter_block_hbd_wo_buf_c(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride,
-            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val, blk_size,
+            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size_x,
+#endif
+            blk_size_y,
             true, shift_bits, edge_clf, cm->ccso_info.ccso_bo_only[plane]);
       } else {
         ccso_filter_block_hbd_wo_buf(
             src_y, dst_yuv, x, y, pic_width, pic_height, src_cls,
             cm->ccso_info.filter_offset[plane], ccso_ext_stride, dst_stride,
-            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val, blk_size,
+            y_uv_hscale, y_uv_vscale, thr, neg_thr, src_loc, max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size_x,
+#endif
+            blk_size_y,
             true, shift_bits, edge_clf, 0);
       }
     }
-    dst_yuv += (dst_stride << blk_log2);
-    src_y += (ccso_ext_stride << (blk_log2 + y_uv_vscale));
+    dst_yuv += (dst_stride << blk_log2_y);
+    src_y += (ccso_ext_stride << (blk_log2_y + y_uv_vscale));
+#if CONFIG_CCSO_DEBUG
+    printf("\n");
+#endif
   }
 }
 
@@ -465,4 +617,4 @@
 
   to->ccso_enable[plane] = from->ccso_enable[plane];
 }
-#endif  // CONFIG_CCSO_IMPROVE
\ No newline at end of file
+#endif  // CONFIG_CCSO_IMPROVE
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 90dc0e6..8a86fbd 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -131,11 +131,13 @@
 #define IBP_WEIGHT_SIZE (1 << IBP_WEIGHT_SIZE_LOG2)
 #endif  // CONFIG_IBP_WEIGHT
 
+#if !CONFIG_CCSO_FU_BUGFIX
 // Cross-Component Sample Offset (CCSO)
 #define CCSO_BLK_SIZE 7
 #define CCSO_PADDING_SIZE 5
 #define CCSO_BAND_NUM 128
 #define CCSO_NUM_COMPONENTS 3
+#endif
 
 #define BUGFIX_AMVD_AMVR 1
 // Supported scale modes for JOINT_NEWMV
@@ -153,6 +155,14 @@
 #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
 #define BLOCK_128_MI_SIZE_LOG2 5
 
+#if CONFIG_CCSO_FU_BUGFIX
+// Cross-Component Sample Offset (CCSO)
+#define CCSO_BLK_SIZE MAX_SB_SIZE_LOG2
+#define CCSO_PADDING_SIZE 5
+#define CCSO_BAND_NUM 128
+#define CCSO_NUM_COMPONENTS 3
+#endif
+
 #if CONFIG_ENABLE_MHCCP
 #define MHCCP_CONTEXT_GROUP_SIZE 7
 #define LINE_NUM 3
diff --git a/av1/common/pred_common.c b/av1/common/pred_common.c
index 0725e2b..a856199 100644
--- a/av1/common/pred_common.c
+++ b/av1/common/pred_common.c
@@ -524,10 +524,15 @@
 bool av1_check_ccso_mbmi_inside_tile(const MACROBLOCKD *xd,
                                      const MB_MODE_INFO *const mbmi) {
   const TileInfo *const tile = &xd->tile;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+  const int blk_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+#else
   const int blk_size_y =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y - MI_SIZE_LOG2)) - 1;
   const int blk_size_x =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_x - MI_SIZE_LOG2)) - 1;
+#endif
 
   return (((mbmi->mi_row_start & ~blk_size_y) >= tile->mi_row_start) &&
           ((mbmi->mi_col_start & ~blk_size_x) >= tile->mi_col_start) &&
@@ -559,10 +564,15 @@
     neighbor1_ccso_available = av1_check_ccso_mbmi_inside_tile(xd, neighbor1);
   }
 
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+  const int blk_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+#else
   const int blk_size_y =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y - MI_SIZE_LOG2)) - 1;
   const int blk_size_x =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_x - MI_SIZE_LOG2)) - 1;
+#endif
 
   if (neighbor0_ccso_available && neighbor1_ccso_available) {
     int is_neighbor0_ccso = 0;
diff --git a/av1/common/x86/highbd_ccso_avx2.c b/av1/common/x86/highbd_ccso_avx2.c
index 9c2ac24..b796018 100644
--- a/av1/common/x86/highbd_ccso_avx2.c
+++ b/av1/common/x86/highbd_ccso_avx2.c
@@ -51,7 +51,13 @@
     const int y_uv_vscale,
     // const int pad_stride, no pad size anymore
     const int quant_step_size, const int inv_quant_step, const int *rec_idx,
-    const int max_val, const int blk_size, const bool isSingleBand,
+    const int max_val,
+#if CONFIG_CCSO_FU_BUGFIX
+    const int blk_size_x, const int blk_size_y,
+#else
+    const int blk_size,
+#endif
+    const bool isSingleBand,
     const uint8_t shift_bits, const int edge_clf, const uint8_t ccso_bo_only) {
   assert(ccso_bo_only == 0);
   (void)ccso_bo_only;
@@ -82,16 +88,22 @@
 
   int y_offset;
   int x_offset, x_remainder;
-  if (y + blk_size >= pic_height)
+
+#if !CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_x = blk_size;
+  const int blk_size_y = blk_size;
+#endif
+
+  if (y + blk_size_y >= pic_height)
     y_offset = pic_height - y;
   else
-    y_offset = blk_size;
+    y_offset = blk_size_y;
 
-  if (x + blk_size >= pic_width) {
+  if (x + blk_size_x >= pic_width) {
     x_offset = ((pic_width - x) >> 4) << 4;
     x_remainder = pic_width - x - x_offset;
   } else {
-    x_offset = blk_size;
+    x_offset = blk_size_x;
     x_remainder = 0;
   }
   for (int yOff = 0; yOff < y_offset; yOff++) {
@@ -245,7 +257,12 @@
                                 const int pic_width, const int pic_height,
                                 const int y_uv_hscale, const int y_uv_vscale,
                                 const int qstep, const int neg_qstep,
-                                const int *src_loc, const int blk_size,
+                                const int *src_loc,
+#if CONFIG_CCSO_FU_BUGFIX
+                                const int blk_size_x, const int blk_size_y,
+#else
+                                const int blk_size,
+#endif
                                 const int edge_clf) {
   const int quant_step_size = qstep;
   const int inv_quant_step = neg_qstep;
@@ -275,16 +292,22 @@
 
   int y_offset;
   int x_offset, x_remainder;
-  if (y + blk_size >= pic_height)
+
+#if !CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_x = blk_size;
+  const int blk_size_y = blk_size;
+#endif
+
+  if (y + blk_size_y >= pic_height)
     y_offset = pic_height - y;
   else
-    y_offset = blk_size;
+    y_offset = blk_size_y;
 
-  if (x + blk_size >= pic_width) {
+  if (x + blk_size_x >= pic_width) {
     x_offset = ((pic_width - x) >> 4) << 4;
     x_remainder = pic_width - x - x_offset;
   } else {
-    x_offset = blk_size;
+    x_offset = blk_size_x;
     x_remainder = 0;
   }
   for (int yOff = 0; yOff < y_offset; yOff++) {
@@ -421,7 +444,12 @@
     const uint16_t *src_y, uint16_t *dts_yuv, const uint8_t *src_cls0,
     const uint8_t *src_cls1, const int src_y_stride, const int dst_stride,
     const int ccso_stride, const int x, const int y, const int pic_width,
-    const int pic_height, const int8_t *filter_offset, const int blk_size,
+    const int pic_height, const int8_t *filter_offset,
+#if CONFIG_CCSO_FU_BUGFIX
+    const int blk_size_x, const int blk_size_y,
+#else
+    const int blk_size,
+#endif
     const int y_uv_hscale, const int y_uv_vscale, const int max_val,
     const uint8_t shift_bits, const uint8_t ccso_bo_only) {
   (void)ccso_bo_only;
@@ -440,16 +468,22 @@
 
   int y_offset;
   int x_offset, x_remainder;
-  if (y + blk_size >= pic_height)
+
+#if !CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_x = blk_size;
+  const int blk_size_y = blk_size;
+#endif
+
+  if (y + blk_size_y >= pic_height)
     y_offset = pic_height - y;
   else
-    y_offset = blk_size;
+    y_offset = blk_size_y;
 
-  if (x + blk_size >= pic_width) {
+  if (x + blk_size_x >= pic_width) {
     x_offset = ((pic_width - x) >> 4) << 4;
     x_remainder = pic_width - x - x_offset;
   } else {
-    x_offset = blk_size;
+    x_offset = blk_size_x;
     x_remainder = 0;
   }
   for (int yOff = 0; yOff < y_offset; yOff++) {
@@ -528,7 +562,12 @@
     const uint16_t *src_y, uint16_t *dts_yuv, const uint8_t *src_cls0,
     const uint8_t *src_cls1, const int src_y_stride, const int dst_stride,
     const int ccso_stride, const int x, const int y, const int pic_width,
-    const int pic_height, const int8_t *filter_offset, const int blk_size,
+    const int pic_height, const int8_t *filter_offset,
+#if CONFIG_CCSO_FU_BUGFIX
+    const int blk_size_x, const int blk_size_y,
+#else
+    const int blk_size,
+#endif
     const int y_uv_hscale, const int y_uv_vscale, const int max_val,
     const uint8_t shift_bits, const uint8_t ccso_bo_only) {
   (void)ccso_bo_only;
@@ -547,16 +586,22 @@
 
   int y_offset;
   int x_offset, x_remainder;
-  if (y + blk_size >= pic_height)
+
+#if !CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_x = blk_size;
+  const int blk_size_y = blk_size;
+#endif
+
+  if (y + blk_size_y >= pic_height)
     y_offset = pic_height - y;
   else
-    y_offset = blk_size;
+    y_offset = blk_size_y;
 
-  if (x + blk_size >= pic_width) {
+  if (x + blk_size_x >= pic_width) {
     x_offset = ((pic_width - x) >> 4) << 4;
     x_remainder = pic_width - x - x_offset;
   } else {
-    x_offset = blk_size;
+    x_offset = blk_size_x;
     x_remainder = 0;
   }
   for (int yOff = 0; yOff < y_offset; yOff++) {
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 344268c..d42bb86 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3894,6 +3894,12 @@
 #endif  // CONFIG_CCSO_SIGFIX
             cm->ccso_info.max_band_log2[plane] = aom_rb_read_literal(rb, 2);
           }
+#if CONFIG_CCSO_DEBUG
+          printf("CCSO: plane %d quant_idx %d ext_filter_support %d edge_clf %d ccso_bo_only %d max_band_log2 %d scale_idx %d @ %s\n",
+                 plane, cm->ccso_info.quant_idx[plane], cm->ccso_info.ext_filter_support[plane],
+                 cm->ccso_info.edge_clf[plane], cm->ccso_info.ccso_bo_only[plane],
+                 cm->ccso_info.max_band_log2[plane], cm->ccso_info.scale_idx[plane], __FUNCTION__);
+#endif
           const int max_band = 1 << cm->ccso_info.max_band_log2[plane];
 #if !CONFIG_CCSO_SIGFIX
           cm->ccso_info.edge_clf[plane] = aom_rb_read_bit(rb);
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 305dc9a..3a81162 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -103,8 +103,13 @@
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type[PLANE_TYPE_Y];
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
+#if CONFIG_CCSO_FU_BUGFIX
+  const int log2_w = CCSO_BLK_SIZE;
+  const int log2_h = CCSO_BLK_SIZE;
+#else
   const int log2_w = CCSO_BLK_SIZE + xd->plane[1].subsampling_x;
   const int log2_h = CCSO_BLK_SIZE + xd->plane[1].subsampling_y;
+#endif
   const int f_w = 1 << log2_w >> MI_SIZE_LOG2;
   const int f_h = 1 << log2_h >> MI_SIZE_LOG2;
   const int ccso_nhfb = (mi_params->mi_cols + f_w - 1) / f_w;
@@ -122,18 +127,27 @@
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+  const int blk_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+#else
   const int blk_size_y =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y - MI_SIZE_LOG2)) - 1;
   const int blk_size_x =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_x - MI_SIZE_LOG2)) - 1;
+#endif
 #if CONFIG_CCSO_IMPROVE
   int blk_idc;
 #endif
   if (!(mi_row & blk_size_y) && !(mi_col & blk_size_x) &&
       cm->ccso_info.ccso_enable[0]) {
 #if CONFIG_CCSO_IMPROVE
+#if CONFIG_CCSO_FU_BUGFIX
+    const int log2_filter_unit_size = CCSO_BLK_SIZE;
+#else
     const int log2_filter_unit_size =
         CCSO_BLK_SIZE + xd->plane[1].subsampling_x;
+#endif
     const int ccso_nhfb = ((mi_params->mi_cols >> xd->plane[0].subsampling_x) +
                            (1 << log2_filter_unit_size >> 2) - 1) /
                           (1 << log2_filter_unit_size >> 2);
@@ -144,10 +158,18 @@
       const int ccso_ctx = av1_get_ccso_context(xd, 0);
       blk_idc = aom_read_symbol(r, xd->tile_ctx->ccso_cdf[0][ccso_ctx], 2,
                                 ACCT_INFO("blk_idc"));
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] read ccso_blk_y %d @ %s\n", mi_row, mi_col, blk_idc, __FUNCTION__);
+#endif
     } else {
       CcsoInfo *ref_frame_ccso_info =
           &get_ref_frame_buf(cm, cm->ccso_info.ccso_ref_idx[0])->ccso_info;
       blk_idc = ref_frame_ccso_info->sb_filter_control[0][sb_idx];
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] copy [%d] ccso_blk_y %d : 0x%p @ %s\n", mi_row, mi_col, sb_idx, blk_idc,
+             (void *)mi_params->mi_grid_base[(mi_row & ~blk_size_y) * mi_params->mi_stride +
+                                             (mi_col & ~blk_size_x)], __FUNCTION__);
+#endif
     }
 #else
     const int blk_idc =
@@ -174,7 +196,11 @@
   if (!(mi_row & blk_size_y) && !(mi_col & blk_size_x) &&
       cm->ccso_info.ccso_enable[1]) {
 #if CONFIG_CCSO_IMPROVE
+#if CONFIG_CCSO_FU_BUGFIX
+    const int log2_filter_unit_size = (CCSO_BLK_SIZE - xd->plane[1].subsampling_x);
+#else
     const int log2_filter_unit_size = CCSO_BLK_SIZE;
+#endif
     const int ccso_nhfb = ((mi_params->mi_cols >> xd->plane[1].subsampling_x) +
                            (1 << log2_filter_unit_size >> 2) - 1) /
                           (1 << log2_filter_unit_size >> 2);
@@ -185,10 +211,18 @@
       const int ccso_ctx = av1_get_ccso_context(xd, 1);
       blk_idc = aom_read_symbol(r, xd->tile_ctx->ccso_cdf[1][ccso_ctx], 2,
                                 ACCT_INFO("blk_idc"));
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] read ccso_blk_u %d @ %s\n", mi_row, mi_col, blk_idc, __FUNCTION__);
+#endif
     } else {
       CcsoInfo *ref_frame_ccso_info =
           &get_ref_frame_buf(cm, cm->ccso_info.ccso_ref_idx[1])->ccso_info;
       blk_idc = ref_frame_ccso_info->sb_filter_control[1][sb_idx];
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] copy [%d] ccso_blk_u %d : 0x%p @ %s\n", mi_row, mi_col, sb_idx, blk_idc,
+             (void *)mi_params->mi_grid_base[(mi_row & ~blk_size_y) * mi_params->mi_stride +
+                                             (mi_col & ~blk_size_x)], __FUNCTION__);
+#endif
     }
 #else
     const int blk_idc =
@@ -215,7 +249,11 @@
   if (!(mi_row & blk_size_y) && !(mi_col & blk_size_x) &&
       cm->ccso_info.ccso_enable[2]) {
 #if CONFIG_CCSO_IMPROVE
+#if CONFIG_CCSO_FU_BUGFIX
+    const int log2_filter_unit_size = (CCSO_BLK_SIZE - xd->plane[2].subsampling_x);
+#else
     const int log2_filter_unit_size = CCSO_BLK_SIZE;
+#endif
     const int ccso_nhfb = ((mi_params->mi_cols >> xd->plane[2].subsampling_x) +
                            (1 << log2_filter_unit_size >> 2) - 1) /
                           (1 << log2_filter_unit_size >> 2);
@@ -226,10 +264,18 @@
       const int ccso_ctx = av1_get_ccso_context(xd, 2);
       blk_idc = aom_read_symbol(r, xd->tile_ctx->ccso_cdf[2][ccso_ctx], 2,
                                 ACCT_INFO("blk_idc"));
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] read ccso_blk_v %d @ %s\n", mi_row, mi_col, blk_idc, __FUNCTION__);
+#endif
     } else {
       CcsoInfo *ref_frame_ccso_info =
           &get_ref_frame_buf(cm, cm->ccso_info.ccso_ref_idx[2])->ccso_info;
       blk_idc = ref_frame_ccso_info->sb_filter_control[2][sb_idx];
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] copy [%d] ccso_blk_v %d : 0x%p @ %s\n", mi_row, mi_col, sb_idx, blk_idc,
+             (void *)mi_params->mi_grid_base[(mi_row & ~blk_size_y) * mi_params->mi_stride +
+                                             (mi_col & ~blk_size_x)], __FUNCTION__);
+#endif
     }
 #else
     const int blk_idc =
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 1cd32bc..bf6bd0d 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1949,10 +1949,15 @@
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+  const int blk_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+#else
   const int blk_size_y =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y - MI_SIZE_LOG2)) - 1;
   const int blk_size_x =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_x - MI_SIZE_LOG2)) - 1;
+#endif
   const MB_MODE_INFO *mbmi =
       mi_params->mi_grid_base[(mi_row & ~blk_size_y) * mi_params->mi_stride +
                               (mi_col & ~blk_size_x)];
@@ -1964,6 +1969,9 @@
       const int ccso_ctx = av1_get_ccso_context(xd, 0);
       aom_write_symbol(w, mbmi->ccso_blk_y == 0 ? 0 : 1,
                        xd->tile_ctx->ccso_cdf[0][ccso_ctx], 2);
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] write ccso_blk_y %d @ %s\n", mi_row, mi_col, mbmi->ccso_blk_y == 0 ? 0 : 1, __FUNCTION__);
+#endif
     }
 #else
     aom_write_symbol(w, mbmi->ccso_blk_y == 0 ? 0 : 1,
@@ -1979,6 +1987,9 @@
       const int ccso_ctx = av1_get_ccso_context(xd, 1);
       aom_write_symbol(w, mbmi->ccso_blk_u == 0 ? 0 : 1,
                        xd->tile_ctx->ccso_cdf[1][ccso_ctx], 2);
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] write ccso_blk_u %d @ %s\n", mi_row, mi_col, mbmi->ccso_blk_u == 0 ? 0 : 1, __FUNCTION__);
+#endif
     }
 #else
     aom_write_symbol(w, mbmi->ccso_blk_u == 0 ? 0 : 1,
@@ -1994,6 +2005,9 @@
       const int ccso_ctx = av1_get_ccso_context(xd, 2);
       aom_write_symbol(w, mbmi->ccso_blk_v == 0 ? 0 : 1,
                        xd->tile_ctx->ccso_cdf[2][ccso_ctx], 2);
+#if CONFIG_CCSO_DEBUG
+      printf("CCSO: [%d,%d] write ccso_blk_v %d @ %s\n", mi_row, mi_col, mbmi->ccso_blk_v == 0 ? 0 : 1, __FUNCTION__);
+#endif
     }
 #else
     aom_write_symbol(w, mbmi->ccso_blk_v == 0 ? 0 : 1,
diff --git a/av1/encoder/pickccso.c b/av1/encoder/pickccso.c
index 794b248..3d21b04 100644
--- a/av1/encoder/pickccso.c
+++ b/av1/encoder/pickccso.c
@@ -75,11 +75,20 @@
                              const int pic_width, const int pic_height,
                              const int y_uv_hscale, const int y_uv_vscale,
                              const int qstep, const int neg_qstep,
-                             const int *src_loc, const int blk_size,
+                             const int *src_loc,
+#if CONFIG_CCSO_FU_BUGFIX
+                             const int blk_size_x,
+#endif
+                             const int blk_size_y,
                              const int edge_clf) {
   int src_cls[2];
-  const int y_end = AOMMIN(pic_height - y, blk_size);
-  const int x_end = AOMMIN(pic_width - x, blk_size);
+#if !CONFIG_CCSO_FU_BUGFIX
+  // Only blk_size_y is passed in this configuration; the caller's filter
+  // unit is square, so reuse it for the horizontal extent.
+  const int blk_size_x = blk_size_y;
+#endif
+  const int y_end = AOMMIN(pic_height - y, blk_size_y);
+  const int x_end = AOMMIN(pic_width - x, blk_size_x);
   for (int y_start = 0; y_start < y_end; y_start++) {
     const int y_pos = y_start;
     for (int x_start = 0; x_start < x_end; x_start++) {
@@ -114,19 +118,29 @@
   const int neg_qstep = qstep * -1;
   int src_loc[2];
   derive_ccso_sample_pos(cm, src_loc, ccso_stride_ext, filter_sup);
-  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
-  const int blk_size = 1 << blk_log2;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_log2_y = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+  const int blk_log2_x = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
+  const int blk_log2_y = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+  const int blk_log2_x = blk_log2_y;
+#endif
+  const int blk_size_y = 1 << blk_log2_y;
+  const int blk_size_x = 1 << blk_log2_x;
   src_y += CCSO_PADDING_SIZE * ccso_stride_ext + CCSO_PADDING_SIZE;
-  for (int y = 0; y < pic_height; y += blk_size) {
-    for (int x = 0; x < pic_width; x += blk_size) {
+  for (int y = 0; y < pic_height; y += blk_size_y) {
+    for (int x = 0; x < pic_width; x += blk_size_x) {
       ccso_derive_src_block(src_y, src_cls0, src_cls1, ccso_stride_ext,
                             ccso_stride, x, y, pic_width, pic_height,
                             y_uv_hscale, y_uv_vscale, qstep, neg_qstep, src_loc,
-                            blk_size, edge_clf);
+#if CONFIG_CCSO_FU_BUGFIX
+                            blk_size_x,
+#endif
+                            blk_size_y, edge_clf);
     }
-    src_y += (ccso_stride_ext << (blk_log2 + y_uv_vscale));
-    src_cls0 += (ccso_stride << (blk_log2 + y_uv_vscale));
-    src_cls1 += (ccso_stride << (blk_log2 + y_uv_vscale));
+    src_y += (ccso_stride_ext << (blk_log2_y + y_uv_vscale));
+    src_cls0 += (ccso_stride << (blk_log2_y + y_uv_vscale));
+    src_cls1 += (ccso_stride << (blk_log2_y + y_uv_vscale));
   }
 }
 
@@ -144,16 +158,23 @@
   int fb_idx = 0;
   uint8_t cur_src_cls0;
   uint8_t cur_src_cls1;
-  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
-  const int blk_size = 1 << blk_log2;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_log2_y = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+  const int blk_log2_x = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
+  const int blk_log2_y = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+  const int blk_log2_x = blk_log2_y;
+#endif
+  const int blk_size_y = 1 << blk_log2_y;
+  const int blk_size_x = 1 << blk_log2_x;
   const int scaled_ext_stride = (ctx->ccso_stride_ext << y_uv_vscale);
   const int scaled_stride = (ctx->ccso_stride << y_uv_vscale);
   src_y += CCSO_PADDING_SIZE * ctx->ccso_stride_ext + CCSO_PADDING_SIZE;
-  for (int y = 0; y < pic_height; y += blk_size) {
-    for (int x = 0; x < pic_width; x += blk_size) {
+  for (int y = 0; y < pic_height; y += blk_size_y) {
+    for (int x = 0; x < pic_width; x += blk_size_x) {
       fb_idx++;
-      const int y_end = AOMMIN(pic_height - y, blk_size);
-      const int x_end = AOMMIN(pic_width - x, blk_size);
+      const int y_end = AOMMIN(pic_height - y, blk_size_y);
+      const int x_end = AOMMIN(pic_width - x, blk_size_x);
       for (int y_start = 0; y_start < y_end; y_start++) {
         for (int x_start = 0; x_start < x_end; x_start++) {
           const int x_pos = x + x_start;
@@ -177,11 +198,11 @@
       src_cls0 -= scaled_stride * y_end;
       src_cls1 -= scaled_stride * y_end;
     }
-    ref += (ctx->ccso_stride << blk_log2);
-    dst += (ctx->ccso_stride << blk_log2);
-    src_y += (ctx->ccso_stride_ext << (blk_log2 + y_uv_vscale));
-    src_cls0 += (ctx->ccso_stride << (blk_log2 + y_uv_vscale));
-    src_cls1 += (ctx->ccso_stride << (blk_log2 + y_uv_vscale));
+    ref += (ctx->ccso_stride << blk_log2_y);
+    dst += (ctx->ccso_stride << blk_log2_y);
+    src_y += (ctx->ccso_stride_ext << (blk_log2_y + y_uv_vscale));
+    src_cls0 += (ctx->ccso_stride << (blk_log2_y + y_uv_vscale));
+    src_cls1 += (ctx->ccso_stride << (blk_log2_y + y_uv_vscale));
   }
 }
 
@@ -194,15 +215,22 @@
   const int y_uv_hscale = xd->plane[plane].subsampling_x;
   const int y_uv_vscale = xd->plane[plane].subsampling_y;
   int fb_idx = 0;
-  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
-  const int blk_size = 1 << blk_log2;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_log2_y = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+  const int blk_log2_x = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
+  const int blk_log2_y = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+  const int blk_log2_x = blk_log2_y;
+#endif
+  const int blk_size_y = 1 << blk_log2_y;
+  const int blk_size_x = 1 << blk_log2_x;
   const int scaled_ext_stride = (ctx->ccso_stride_ext << y_uv_vscale);
   src_y += CCSO_PADDING_SIZE * ctx->ccso_stride_ext + CCSO_PADDING_SIZE;
-  for (int y = 0; y < pic_height; y += blk_size) {
-    for (int x = 0; x < pic_width; x += blk_size) {
+  for (int y = 0; y < pic_height; y += blk_size_y) {
+    for (int x = 0; x < pic_width; x += blk_size_x) {
       fb_idx++;
-      const int y_end = AOMMIN(pic_height - y, blk_size);
-      const int x_end = AOMMIN(pic_width - x, blk_size);
+      const int y_end = AOMMIN(pic_height - y, blk_size_y);
+      const int x_end = AOMMIN(pic_width - x, blk_size_x);
       for (int y_start = 0; y_start < y_end; y_start++) {
         for (int x_start = 0; x_start < x_end; x_start++) {
           const int x_pos = x + x_start;
@@ -219,9 +247,9 @@
       dst -= ctx->ccso_stride * y_end;
       src_y -= scaled_ext_stride * y_end;
     }
-    ref += (ctx->ccso_stride << blk_log2);
-    dst += (ctx->ccso_stride << blk_log2);
-    src_y += (ctx->ccso_stride_ext << (blk_log2 + y_uv_vscale));
+    ref += (ctx->ccso_stride << blk_log2_y);
+    dst += (ctx->ccso_stride << blk_log2_y);
+    src_y += (ctx->ccso_stride_ext << (blk_log2_y + y_uv_vscale));
   }
 }
 
@@ -230,7 +258,12 @@
     const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0,
     const uint8_t *src_cls1, const int src_y_stride, const int dst_stride,
     const int src_cls_stride, const int x, const int y, const int pic_width,
-    const int pic_height, const int8_t *filter_offset, const int blk_size,
+    const int pic_height, const int8_t *filter_offset,
+#if CONFIG_CCSO_FU_BUGFIX
+    const int blk_size_x, const int blk_size_y,
+#else
+    const int blk_size,
+#endif
     const int y_uv_hscale, const int y_uv_vscale, const int max_val,
     const uint8_t shift_bits, const uint8_t ccso_bo_only) {
   assert(ccso_bo_only == 1);
@@ -242,8 +275,13 @@
 
   int cur_src_cls0;
   int cur_src_cls1;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int y_end = AOMMIN(pic_height - y, blk_size_y);
+  const int x_end = AOMMIN(pic_width - x, blk_size_x);
+#else
   const int y_end = AOMMIN(pic_height - y, blk_size);
   const int x_end = AOMMIN(pic_width - x, blk_size);
+#endif
   for (int y_start = 0; y_start < y_end; y_start++) {
     const int y_pos = y_start;
     for (int x_start = 0; x_start < x_end; x_start++) {
@@ -266,7 +304,12 @@
     const uint16_t *src_y, uint16_t *dst_yuv, const uint8_t *src_cls0,
     const uint8_t *src_cls1, const int src_y_stride, const int dst_stride,
     const int src_cls_stride, const int x, const int y, const int pic_width,
-    const int pic_height, const int8_t *filter_offset, const int blk_size,
+    const int pic_height, const int8_t *filter_offset,
+#if CONFIG_CCSO_FU_BUGFIX
+    const int blk_size_x, const int blk_size_y,
+#else
+    const int blk_size,
+#endif
     const int y_uv_hscale, const int y_uv_vscale, const int max_val,
     const uint8_t shift_bits, const uint8_t ccso_bo_only) {
   if (ccso_bo_only) {
@@ -275,8 +318,13 @@
   }
   int cur_src_cls0;
   int cur_src_cls1;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int y_end = AOMMIN(pic_height - y, blk_size_y);
+  const int x_end = AOMMIN(pic_width - x, blk_size_x);
+#else
   const int y_end = AOMMIN(pic_height - y, blk_size);
   const int x_end = AOMMIN(pic_width - x, blk_size);
+#endif
   for (int y_start = 0; y_start < y_end; y_start++) {
     const int y_pos = y_start;
     for (int x_start = 0; x_start < x_end; x_start++) {
@@ -311,7 +359,11 @@
   const int pic_height = xd->plane[plane].dst.height;
   const int pic_width = xd->plane[plane].dst.width;
   const int max_val = (1 << cm->seq_params.bit_depth) - 1;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_log2 = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+#else
   const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+#endif
   const int blk_size = 1 << blk_log2;
   src_y += CCSO_PADDING_SIZE * ccso_stride_ext + CCSO_PADDING_SIZE;
   for (int y = 0; y < pic_height; y += blk_size) {
@@ -324,12 +376,18 @@
 #endif  // CONFIG_CCSO_IMPROVE
             src_y, dst_yuv, src_cls0, src_cls1, ccso_stride_ext, dst_stride,
             ccso_stride, x, y, pic_width, pic_height, filter_offset, blk_size,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size,
+#endif
             // y_uv_scale in h and v shall be zero
             0, 0, max_val, shift_bits, ccso_bo_only);
       } else {
         ccso_filter_block_hbd_with_buf(
             src_y, dst_yuv, src_cls0, src_cls1, ccso_stride_ext, dst_stride,
             ccso_stride, x, y, pic_width, pic_height, filter_offset, blk_size,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size,
+#endif
             // y_uv_scale in h and v shall be zero
             0, 0, max_val, shift_bits, 0);
       }
@@ -352,11 +410,18 @@
   const int y_uv_hscale = xd->plane[plane].subsampling_x;
   const int y_uv_vscale = xd->plane[plane].subsampling_y;
   const int max_val = (1 << cm->seq_params.bit_depth) - 1;
-  const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
-  const int blk_size = 1 << blk_log2;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_log2_y = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+  const int blk_log2_x = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
+  const int blk_log2_y = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+  const int blk_log2_x = blk_log2_y;
+#endif
+  const int blk_size_y = 1 << blk_log2_y;
+  const int blk_size_x = 1 << blk_log2_x;
   src_y += CCSO_PADDING_SIZE * ccso_stride_ext + CCSO_PADDING_SIZE;
-  for (int y = 0; y < pic_height; y += blk_size) {
-    for (int x = 0; x < pic_width; x += blk_size) {
+  for (int y = 0; y < pic_height; y += blk_size_y) {
+    for (int x = 0; x < pic_width; x += blk_size_x) {
       if (ccso_bo_only) {
 #if CONFIG_CCSO_IMPROVE
         ccso_filter_block_hbd_with_buf_bo_only(
@@ -364,19 +429,27 @@
         ccso_filter_block_hbd_with_buf_c(
 #endif  // CONFIG_CCSO_IMPROVE
             src_y, dst_yuv, src_cls0, src_cls1, ccso_stride_ext, dst_stride,
-            ccso_stride, x, y, pic_width, pic_height, filter_offset, blk_size,
+            ccso_stride, x, y, pic_width, pic_height, filter_offset,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size_x,
+#endif
+            blk_size_y,
             y_uv_hscale, y_uv_vscale, max_val, shift_bits, ccso_bo_only);
       } else {
         ccso_filter_block_hbd_with_buf(
             src_y, dst_yuv, src_cls0, src_cls1, ccso_stride_ext, dst_stride,
-            ccso_stride, x, y, pic_width, pic_height, filter_offset, blk_size,
+            ccso_stride, x, y, pic_width, pic_height, filter_offset,
+#if CONFIG_CCSO_FU_BUGFIX
+            blk_size_x,
+#endif
+            blk_size_y,
             y_uv_hscale, y_uv_vscale, max_val, shift_bits, 0);
       }
     }
-    dst_yuv += (dst_stride << blk_log2);
-    src_y += (ccso_stride_ext << (blk_log2 + y_uv_vscale));
-    src_cls0 += (ccso_stride << (blk_log2 + y_uv_vscale));
-    src_cls1 += (ccso_stride << (blk_log2 + y_uv_vscale));
+    dst_yuv += (dst_stride << blk_log2_y);
+    src_y += (ccso_stride_ext << (blk_log2_y + y_uv_vscale));
+    src_cls0 += (ccso_stride << (blk_log2_y + y_uv_vscale));
+    src_cls1 += (ccso_stride << (blk_log2_y + y_uv_vscale));
   }
 }
 
@@ -427,8 +500,8 @@
                      (x >> log2_filter_unit_size_x)] = ssd;
       *total_distortion += ssd;
     }
-    org += (org_stride << log2_filter_unit_size_x);
-    rec16 += (rec_stride << log2_filter_unit_size_x);
+    org += (org_stride << log2_filter_unit_size_y);
+    rec16 += (rec_stride << log2_filter_unit_size_y);
   }
 }
 
@@ -484,9 +557,13 @@
                           uint64_t *cur_total_dist, int *cur_total_rate,
                           bool *filter_enable, const int rdmult) {
   aom_cdf_prob ccso_cdf[CCSO_CONTEXT][CDF_SIZE(2)];
+#if CONFIG_CCSO_FU_BUGFIX
+  const int log2_filter_unit_size = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
   const int log2_filter_unit_size =
       plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + xd->plane[1].subsampling_x;
   ;
+#endif
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int ccso_nhfb =
       ((mi_params->mi_cols >> xd->plane[plane].subsampling_x) +
@@ -499,10 +576,15 @@
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
 
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+  const int blk_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+#else
   const int blk_size_y =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y - MI_SIZE_LOG2)) - 1;
   const int blk_size_x =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_x - MI_SIZE_LOG2)) - 1;
+#endif
 
   *cur_total_dist = 0;
 
@@ -590,9 +672,13 @@
                               bool *filter_enable, const int rdmult) {
   (void)rdmult;
 
+#if CONFIG_CCSO_FU_BUGFIX
+  const int log2_filter_unit_size = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
   const int log2_filter_unit_size =
       plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + xd->plane[1].subsampling_x;
   ;
+#endif
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int ccso_nhfb =
       ((mi_params->mi_cols >> xd->plane[plane].subsampling_x) +
@@ -605,10 +691,15 @@
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
 
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+  const int blk_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2)) - 1;
+#else
   const int blk_size_y =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y - MI_SIZE_LOG2)) - 1;
   const int blk_size_x =
       (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_x - MI_SIZE_LOG2)) - 1;
+#endif
 
   *cur_total_dist = 0;
   *cur_total_rate = 0;
@@ -724,7 +815,11 @@
                                    const int max_edge_interval,
                                    const uint8_t ccso_bo_only) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int blk_log2 = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+#else
   const int blk_log2 = plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + 1;
+#endif
   const int nvfb = ((mi_params->mi_rows >> xd->plane[plane].subsampling_y) +
                     (1 << blk_log2 >> MI_SIZE_LOG2) - 1) /
                    (1 << blk_log2 >> MI_SIZE_LOG2);
@@ -879,10 +974,15 @@
 #endif
 ) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+#if CONFIG_CCSO_FU_BUGFIX
+  const int log2_filter_unit_size_y = CCSO_BLK_SIZE - xd->plane[plane].subsampling_y;
+  const int log2_filter_unit_size_x = CCSO_BLK_SIZE - xd->plane[plane].subsampling_x;
+#else
   const int log2_filter_unit_size_y =
       plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + xd->plane[1].subsampling_y;
   const int log2_filter_unit_size_x =
       plane > 0 ? CCSO_BLK_SIZE : CCSO_BLK_SIZE + xd->plane[1].subsampling_x;
+#endif
 
   const int ccso_nvfb =
       ((mi_params->mi_rows >> xd->plane[plane].subsampling_y) +
@@ -1458,8 +1558,13 @@
     const BLOCK_SIZE bsize = xd->mi[0]->sb_type[PLANE_TYPE_Y];
     const int bw = mi_size_wide[bsize];
     const int bh = mi_size_high[bsize];
+#if CONFIG_CCSO_FU_BUGFIX
+    const int log2_w = CCSO_BLK_SIZE;
+    const int log2_h = CCSO_BLK_SIZE;
+#else
     const int log2_w = CCSO_BLK_SIZE + xd->plane[1].subsampling_x;
     const int log2_h = CCSO_BLK_SIZE + xd->plane[1].subsampling_y;
+#endif
     const int f_w = 1 << log2_w >> MI_SIZE_LOG2;
     const int f_h = 1 << log2_h >> MI_SIZE_LOG2;
     const int step_h = (bh + f_h - 1) / f_h;
@@ -1476,18 +1581,42 @@
               cm->cur_frame->ccso_info.sb_filter_control[plane][sb_idx] =
                   ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
 #endif  // CONFIG_CCSO_IMPROVE
+#if CONFIG_CCSO_FU_BUGFIX
+              const int grid_idx_mbmi = (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) *
+                   row * mi_params->mi_stride +
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col;
+              MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx_mbmi];
+#endif
               if (plane == AOM_PLANE_Y) {
+#if CONFIG_CCSO_FU_BUGFIX
+                mbmi->ccso_blk_y = ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
+#else
                 mi_params
                     ->mi_grid_base
                         [(1 << CCSO_BLK_SIZE >>
                           (MI_SIZE_LOG2 - xd->plane[1].subsampling_y)) *
                              row * mi_params->mi_stride +
                          (1 << CCSO_BLK_SIZE >>
                           (MI_SIZE_LOG2 - xd->plane[1].subsampling_x)) *
                              col]
                     ->ccso_blk_y =
                     ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
+#endif  // CONFIG_CCSO_FU_BUGFIX
+#if CONFIG_CCSO_DEBUG && CONFIG_CCSO_FU_BUGFIX
+                printf("CCSO: [%d,%d] copy [%d] ccso_blk_y %d : 0x%p @ %s\n",
+                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * row,
+                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col,
+                       sb_idx,
+                       ctx->final_filter_control[y_sb * ccso_nhfb + x_sb],
+                       (void *)mi_params->mi_grid_base[(1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) *
+                                                       row * mi_params->mi_stride +
+                                                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col],
+                       __FUNCTION__);
+#endif
               } else if (plane == AOM_PLANE_U) {
+#if CONFIG_CCSO_FU_BUGFIX
+                mbmi->ccso_blk_u = ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
+#else
                 mi_params
                     ->mi_grid_base
                         [(1 << CCSO_BLK_SIZE >>
@@ -1498,7 +1627,22 @@
                              col]
                     ->ccso_blk_u =
                     ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
+#endif  // CONFIG_CCSO_FU_BUGFIX
+#if CONFIG_CCSO_DEBUG && CONFIG_CCSO_FU_BUGFIX
+                printf("CCSO: [%d,%d] copy [%d] ccso_blk_u %d : 0x%p @ %s\n",
+                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * row,
+                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col,
+                       sb_idx,
+                       ctx->final_filter_control[y_sb * ccso_nhfb + x_sb],
+                       (void *)mi_params->mi_grid_base[(1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) *
+                                                       row * mi_params->mi_stride +
+                                                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col],
+                       __FUNCTION__);
+#endif
               } else {
+#if CONFIG_CCSO_FU_BUGFIX
+                mbmi->ccso_blk_v = ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
+#else
                 mi_params
                     ->mi_grid_base
                         [(1 << CCSO_BLK_SIZE >>
@@ -1509,8 +1653,27 @@
                              col]
                     ->ccso_blk_v =
                     ctx->final_filter_control[y_sb * ccso_nhfb + x_sb];
+#endif  // CONFIG_CCSO_FU_BUGFIX
+#if CONFIG_CCSO_DEBUG && CONFIG_CCSO_FU_BUGFIX
+                printf("CCSO: [%d,%d] copy [%d] ccso_blk_v %d : 0x%p @ %s\n",
+                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * row,
+                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col,
+                       sb_idx,
+                       ctx->final_filter_control[y_sb * ccso_nhfb + x_sb],
+                       (void *)mi_params->mi_grid_base[(1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) *
+                                                       row * mi_params->mi_stride +
+                                                       (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col],
+                       __FUNCTION__);
+#endif
               }
 #if CONFIG_CCSO_IMPROVE
+#if CONFIG_CCSO_FU_BUGFIX
+              const int ccso_mib_size_y = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2));
+              const int ccso_mib_size_x = (1 << (CCSO_BLK_SIZE - MI_SIZE_LOG2));
+
+              int mi_row = (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * row;
+              int mi_col = (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * col;
+#else
               const int ccso_mib_size_y =
                   (1 << (CCSO_BLK_SIZE + xd->plane[1].subsampling_y -
                          MI_SIZE_LOG2));
@@ -1524,6 +1687,7 @@
               int mi_col = (1 << CCSO_BLK_SIZE >>
                             (MI_SIZE_LOG2 - xd->plane[1].subsampling_x)) *
                            col;
+#endif  // CONFIG_CCSO_FU_BUGFIX
               for (int j = 0;
                    j < AOMMIN(ccso_mib_size_y, cm->mi_params.mi_rows - mi_row);
                    j++) {
@@ -1572,18 +1736,45 @@
 
       for (int y_sb = 0; y_sb < ccso_nvfb; y_sb++) {
         for (int x_sb = 0; x_sb < ccso_nhfb; x_sb++) {
+#if CONFIG_CCSO_FU_BUGFIX
+          const int grid_idx = (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb *
+                                   mi_params->mi_stride +
+                               (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb;
+          MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
+#endif
           if (plane == AOM_PLANE_Y) {
+#if CONFIG_CCSO_FU_BUGFIX
+            mbmi->ccso_blk_y =
+                ref_frame_ccso_info
+                    ->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb];
+#else
             mi_params
                 ->mi_grid_base[(1 << CCSO_BLK_SIZE >>
                                 (MI_SIZE_LOG2 - xd->plane[1].subsampling_y)) *
                                     y_sb * mi_params->mi_stride +
                                 (1 << CCSO_BLK_SIZE >>
                                 (MI_SIZE_LOG2 - xd->plane[1].subsampling_x)) *
                                     x_sb]
                 ->ccso_blk_y =
                 ref_frame_ccso_info
                     ->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb];
+#endif  // CONFIG_CCSO_FU_BUGFIX
+#if CONFIG_CCSO_DEBUG && CONFIG_CCSO_FU_BUGFIX
+            printf("CCSO: [%d,%d] copy [%d] ccso_blk_y %d : 0x%p @ %s\n",
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb,
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb,
+                   y_sb * ccso_nhfb + x_sb,
+                   ref_frame_ccso_info->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb],
+                   (void *)mi_params->mi_grid_base[(1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb * mi_params->mi_stride +
+                                                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb],
+                   __FUNCTION__);
+#endif
           } else if (plane == AOM_PLANE_U) {
+#if CONFIG_CCSO_FU_BUGFIX
+            mbmi->ccso_blk_u =
+                ref_frame_ccso_info
+                    ->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb];
+#else
             mi_params
                 ->mi_grid_base[(1 << CCSO_BLK_SIZE >>
                                 (MI_SIZE_LOG2 - xd->plane[1].subsampling_y)) *
@@ -1594,7 +1785,23 @@
                 ->ccso_blk_u =
                 ref_frame_ccso_info
                     ->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb];
+#endif                    
+#if CONFIG_CCSO_DEBUG && CONFIG_CCSO_FU_BUGFIX
+            printf("CCSO: [%d,%d] copy [%d] ccso_blk_u %d : 0x%p @ %s\n",
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb,
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb,
+                   y_sb * ccso_nhfb + x_sb,
+                   ref_frame_ccso_info->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb],
+                   mi_params->mi_grid_base[(1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb * mi_params->mi_stride +
+                                           (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb],
+                   __FUNCTION__);
+#endif
           } else {
+#if CONFIG_CCSO_FU_BUGFIX
+            mbmi->ccso_blk_v =
+                ref_frame_ccso_info
+                    ->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb];
+#else
             mi_params
                 ->mi_grid_base[(1 << CCSO_BLK_SIZE >>
                                 (MI_SIZE_LOG2 - xd->plane[2].subsampling_y)) *
@@ -1605,6 +1812,17 @@
                 ->ccso_blk_v =
                 ref_frame_ccso_info
                     ->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb];
+#endif                    
+#if CONFIG_CCSO_DEBUG && CONFIG_CCSO_FU_BUGFIX
+            printf("CCSO: [%d,%d] copy [%d] ccso_blk_v %d : 0x%p @ %s\n",
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb,
+                   (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb,
+                   y_sb * ccso_nhfb + x_sb,
+                   ref_frame_ccso_info->sb_filter_control[plane][y_sb * ccso_nhfb + x_sb],
+                   mi_params->mi_grid_base[(1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * y_sb * mi_params->mi_stride +
+                                           (1 << CCSO_BLK_SIZE >> MI_SIZE_LOG2) * x_sb],
+                   __FUNCTION__);
+#endif
           }
         }
       }
@@ -1643,6 +1861,12 @@
       cm->cur_frame->ccso_info.reuse_root_ref[plane] =
           ref_frame_ccso_info->reuse_root_ref[plane];
     }
+#if CONFIG_CCSO_DEBUG
+    printf("CCSO: plane %d quant_idx %d ext_filter_support %d edge_clf %d ccso_bo_only %d max_band_log2 %d scale_idx %d @ %s\n",
+           plane, cm->ccso_info.quant_idx[plane], cm->ccso_info.ext_filter_support[plane],
+           cm->ccso_info.edge_clf[plane], cm->ccso_info.ccso_bo_only[plane],
+           cm->ccso_info.max_band_log2[plane], cm->ccso_info.scale_idx[plane], __FUNCTION__);
+#endif
   } else {
     cm->cur_frame->ccso_info.ccso_enable[plane] = 0;
   }
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 90bfc3b..4f1dcad 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -244,7 +244,7 @@
   *width = block_size_wide[bsize];
   *height = block_size_high[bsize];
   *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
-  *height_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+  *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize];
 }
 
 /* Compute MSE only on the blocks we filtered. */
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 61cb8e2..6bd3618 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -358,6 +358,8 @@
 set_aom_config_var(CONFIG_PARTITION_CONTEXT_REDUCE 1
                    "Enable to reduce partition contexts")
 set_aom_config_var(CONFIG_CCSO_IMPROVE 1 "Enable CCSO improvements")
+set_aom_config_var(CONFIG_CCSO_DEBUG 0 "Enable CCSO debug")
+set_aom_config_var(CONFIG_CCSO_FU_BUGFIX 1 "Bugfix to CCSO FU size")
 
 set_aom_config_var(CONFIG_OPT_INTER_MODE_CTX 1
                    "Improvement of all inter mode related contexts")