Make CDEF work with EXT_PARTITION

Make CDEF select a filter strength for every 64x64 block, even when
the block size can be larger than 64x64.

Coding performance on AWCY and on the Google lowres and midres tests
is neutral with and without this patch.

BUG=aomedia:662

Change-Id: Ief82cc51be91fc08a7c6d7e87f6d13bcc4336476
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index a02126a..ba8abbb 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -27,8 +27,8 @@
   maxc = cm->mi_cols - mi_col;
   maxr = cm->mi_rows - mi_row;
 
-  maxr = AOMMIN(maxr, cm->mib_size);
-  maxc = AOMMIN(maxc, cm->mib_size);
+  maxr = AOMMIN(maxr, MI_SIZE_64X64);
+  maxc = AOMMIN(maxc, MI_SIZE_64X64);
 
   for (r = 0; r < maxr; r++) {
     for (c = 0; c < maxc; c++) {
@@ -60,8 +60,8 @@
   maxc = cm->mi_cols - mi_col;
   maxr = cm->mi_rows - mi_row;
 
-  maxr = AOMMIN(maxr, cm->mib_size);
-  maxc = AOMMIN(maxc, cm->mib_size);
+  maxr = AOMMIN(maxr, MI_SIZE_64X64);
+  maxc = AOMMIN(maxc, MI_SIZE_64X64);
 
   const int r_step = mi_size_high[BLOCK_8X8];
   const int c_step = mi_size_wide[BLOCK_8X8];
@@ -161,7 +161,7 @@
   uint16_t src[OD_DERING_INBUF_SIZE];
   uint16_t *linebuf[3];
   uint16_t *colbuf[3];
-  dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  dering_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
   unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
   int dering_count;
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
@@ -178,8 +178,8 @@
   int chroma_dering =
       xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
       xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
-  nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+  nvsb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  nhsb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
   row_dering = aom_malloc(sizeof(*row_dering) * (nhsb + 2) * 2);
   memset(row_dering, 1, sizeof(*row_dering) * (nhsb + 2) * 2);
@@ -202,7 +202,7 @@
   for (sbr = 0; sbr < nvsb; sbr++) {
     for (pli = 0; pli < nplanes; pli++) {
       const int block_height =
-          (MAX_MIB_SIZE << mi_high_l2[pli]) + 2 * OD_FILT_VBORDER;
+          (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * OD_FILT_VBORDER;
       fill_rect(colbuf[pli], OD_FILT_HBORDER, block_height, OD_FILT_HBORDER,
                 OD_DERING_VERY_LARGE);
     }
@@ -213,41 +213,41 @@
       int nhb, nvb;
       int cstart = 0;
       curr_row_dering[sbc] = 0;
-      if (cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                              MAX_MIB_SIZE * sbc] == NULL ||
-          cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                              MAX_MIB_SIZE * sbc]
+      if (cm->mi_grid_visible[MI_SIZE_64X64 * sbr * cm->mi_stride +
+                              MI_SIZE_64X64 * sbc] == NULL ||
+          cm->mi_grid_visible[MI_SIZE_64X64 * sbr * cm->mi_stride +
+                              MI_SIZE_64X64 * sbc]
                   ->mbmi.cdef_strength == -1) {
         dering_left = 0;
         continue;
       }
       if (!dering_left) cstart = -OD_FILT_HBORDER;
-      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
-      nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * sbc);
+      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * sbr);
       int tile_top, tile_left, tile_bottom, tile_right;
-      int mi_idx = MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
+      int mi_idx = MI_SIZE_64X64 * sbr * cm->mi_stride + MI_SIZE_64X64 * sbc;
       MODE_INFO *const mi_tl = cm->mi + mi_idx;
       BOUNDARY_TYPE boundary_tl = mi_tl->mbmi.boundary_info;
       tile_top = boundary_tl & TILE_ABOVE_BOUNDARY;
       tile_left = boundary_tl & TILE_LEFT_BOUNDARY;
 
       if (sbr != nvsb - 1 &&
-          (&cm->mi[mi_idx + (MAX_MIB_SIZE - 1) * cm->mi_stride]))
-        tile_bottom = cm->mi[mi_idx + (MAX_MIB_SIZE - 1) * cm->mi_stride]
+          (&cm->mi[mi_idx + (MI_SIZE_64X64 - 1) * cm->mi_stride]))
+        tile_bottom = cm->mi[mi_idx + (MI_SIZE_64X64 - 1) * cm->mi_stride]
                           .mbmi.boundary_info &
                       TILE_BOTTOM_BOUNDARY;
       else
         tile_bottom = 1;
 
-      if (sbc != nhsb - 1 && (&cm->mi[mi_idx + MAX_MIB_SIZE - 1]))
-        tile_right = cm->mi[mi_idx + MAX_MIB_SIZE - 1].mbmi.boundary_info &
+      if (sbc != nhsb - 1 && (&cm->mi[mi_idx + MI_SIZE_64X64 - 1]))
+        tile_right = cm->mi[mi_idx + MI_SIZE_64X64 - 1].mbmi.boundary_info &
                      TILE_RIGHT_BOUNDARY;
       else
         tile_right = 1;
 
       const int mbmi_cdef_strength =
-          cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                              MAX_MIB_SIZE * sbc]
+          cm->mi_grid_visible[MI_SIZE_64X64 * sbr * cm->mi_stride +
+                              MI_SIZE_64X64 * sbc]
               ->mbmi.cdef_strength;
       level = cm->cdef_strengths[mbmi_cdef_strength] / CLPF_STRENGTHS;
       clpf_strength = cm->cdef_strengths[mbmi_cdef_strength] % CLPF_STRENGTHS;
@@ -259,7 +259,7 @@
       if ((level == 0 && clpf_strength == 0 && uv_level == 0 &&
            uv_clpf_strength == 0) ||
           (dering_count = sb_compute_dering_list(
-               cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, dlist,
+               cm, sbr * MI_SIZE_64X64, sbc * MI_SIZE_64X64, dlist,
                get_filter_skip(level) || get_filter_skip(uv_level))) == 0) {
         dering_left = 0;
         continue;
@@ -293,7 +293,7 @@
         else
           rend = vsize + OD_FILT_VBORDER;
 
-        coffset = sbc * MAX_MIB_SIZE << mi_wide_l2[pli];
+        coffset = sbc * MI_SIZE_64X64 << mi_wide_l2[pli];
         if (sbc == nhsb - 1) {
           /* On the last superblock column, fill in the right border with
              OD_DERING_VERY_LARGE to avoid filtering with the outside. */
@@ -314,14 +314,14 @@
             cm,
             &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER + cstart],
             OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
-            (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr, coffset + cstart,
+            (MI_SIZE_64X64 << mi_high_l2[pli]) * sbr, coffset + cstart,
             xd->plane[pli].dst.stride, rend, cend - cstart);
         if (!prev_row_dering[sbc]) {
-          copy_sb8_16(cm, &src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
-                      xd->plane[pli].dst.buf,
-                      (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
-                      coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER,
-                      hsize);
+          copy_sb8_16(
+              cm, &src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
+              xd->plane[pli].dst.buf,
+              (MI_SIZE_64X64 << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
+              coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER, hsize);
         } else if (sbr > 0) {
           copy_rect(&src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
                     &linebuf[pli][coffset], stride, OD_FILT_VBORDER, hsize);
@@ -330,10 +330,11 @@
                     hsize, OD_DERING_VERY_LARGE);
         }
         if (!prev_row_dering[sbc - 1]) {
-          copy_sb8_16(cm, src, OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
-                      (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
-                      coffset - OD_FILT_HBORDER, xd->plane[pli].dst.stride,
-                      OD_FILT_VBORDER, OD_FILT_HBORDER);
+          copy_sb8_16(
+              cm, src, OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+              (MI_SIZE_64X64 << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
+              coffset - OD_FILT_HBORDER, xd->plane[pli].dst.stride,
+              OD_FILT_VBORDER, OD_FILT_HBORDER);
         } else if (sbr > 0 && sbc > 0) {
           copy_rect(src, OD_FILT_BSTRIDE,
                     &linebuf[pli][coffset - OD_FILT_HBORDER], stride,
@@ -343,11 +344,12 @@
                     OD_DERING_VERY_LARGE);
         }
         if (!prev_row_dering[sbc + 1]) {
-          copy_sb8_16(cm, &src[OD_FILT_HBORDER + (nhb << mi_wide_l2[pli])],
-                      OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
-                      (MAX_MIB_SIZE << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
-                      coffset + hsize, xd->plane[pli].dst.stride,
-                      OD_FILT_VBORDER, OD_FILT_HBORDER);
+          copy_sb8_16(
+              cm, &src[OD_FILT_HBORDER + (nhb << mi_wide_l2[pli])],
+              OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
+              (MI_SIZE_64X64 << mi_high_l2[pli]) * sbr - OD_FILT_VBORDER,
+              coffset + hsize, xd->plane[pli].dst.stride, OD_FILT_VBORDER,
+              OD_FILT_HBORDER);
         } else if (sbr > 0 && sbc < nhsb - 1) {
           copy_rect(&src[hsize + OD_FILT_HBORDER], OD_FILT_BSTRIDE,
                     &linebuf[pli][coffset + hsize], stride, OD_FILT_VBORDER,
@@ -368,7 +370,7 @@
                   rend + OD_FILT_VBORDER, OD_FILT_HBORDER);
         copy_sb8_16(
             cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
-            (MAX_MIB_SIZE << mi_high_l2[pli]) * (sbr + 1) - OD_FILT_VBORDER,
+            (MI_SIZE_64X64 << mi_high_l2[pli]) * (sbr + 1) - OD_FILT_VBORDER,
             coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER, hsize);
 
         if (tile_top) {
@@ -395,8 +397,8 @@
               (uint8_t *)&CONVERT_TO_SHORTPTR(
                   xd->plane[pli]
                       .dst.buf)[xd->plane[pli].dst.stride *
-                                    (MAX_MIB_SIZE * sbr << mi_high_l2[pli]) +
-                                (sbc * MAX_MIB_SIZE << mi_wide_l2[pli])],
+                                    (MI_SIZE_64X64 * sbr << mi_high_l2[pli]) +
+                                (sbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
               xd->plane[pli].dst.stride, dst,
               &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
               xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, dering_count,
@@ -406,8 +408,8 @@
 #endif
           od_dering(&xd->plane[pli]
                          .dst.buf[xd->plane[pli].dst.stride *
-                                      (MAX_MIB_SIZE * sbr << mi_high_l2[pli]) +
-                                  (sbc * MAX_MIB_SIZE << mi_wide_l2[pli])],
+                                      (MI_SIZE_64X64 * sbr << mi_high_l2[pli]) +
+                                  (sbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
                     xd->plane[pli].dst.stride, dst,
                     &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
                     xdec[pli], ydec[pli], dir, NULL, var, pli, dlist,
diff --git a/av1/common/enums.h b/av1/common/enums.h
index a4dfbf8..2b18d32 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -65,6 +65,8 @@
 #define MAX_VARTX_DEPTH 2
 #endif
 
+#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
+
 // Bitstream profiles indicated by 2-3 bits in the uncompressed header.
 // 00: Profile 0.  8-bit 4:2:0 only.
 // 10: Profile 1.  8-bit 4:4:4, 4:2:2, and 4:4:0.
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index c887bb5..247e60e 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -2686,12 +2686,21 @@
 
 #if CONFIG_CDEF
   if (bsize == cm->sb_size) {
-    if (!sb_all_skip(cm, mi_row, mi_col) && !cm->all_lossless) {
-      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.cdef_strength =
-          aom_read_literal(r, cm->cdef_bits, ACCT_STR);
-    } else {
-      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.cdef_strength =
-          -1;
+    int width_step = mi_size_wide[BLOCK_64X64];
+    int height_step = mi_size_high[BLOCK_64X64];
+    int w, h;
+    for (h = 0; (h < mi_size_high[cm->sb_size]) && (mi_row + h < cm->mi_rows);
+         h += height_step) {
+      for (w = 0; (w < mi_size_wide[cm->sb_size]) && (mi_col + w < cm->mi_cols);
+           w += width_step) {
+        if (!cm->all_lossless && !sb_all_skip(cm, mi_row + h, mi_col + w))
+          cm->mi_grid_visible[(mi_row + h) * cm->mi_stride + (mi_col + w)]
+              ->mbmi.cdef_strength =
+              aom_read_literal(r, cm->cdef_bits, ACCT_STR);
+        else
+          cm->mi_grid_visible[(mi_row + h) * cm->mi_stride + (mi_col + w)]
+              ->mbmi.cdef_strength = -1;
+      }
     }
   }
 #endif  // CONFIG_CDEF
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index dc46224..2e0abc1 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3097,11 +3097,24 @@
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_CDEF
-  if (bsize == cm->sb_size && !sb_all_skip(cm, mi_row, mi_col) &&
-      cm->cdef_bits != 0 && !cm->all_lossless) {
-    aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
-                             ->mbmi.cdef_strength,
-                      cm->cdef_bits);
+  if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) {
+    int width_step = mi_size_wide[BLOCK_64X64];
+    int height_step = mi_size_high[BLOCK_64X64];
+    int width, height;
+    for (height = 0; (height < mi_size_high[cm->sb_size]) &&
+                     (mi_row + height < cm->mi_rows);
+         height += height_step) {
+      for (width = 0; (width < mi_size_wide[cm->sb_size]) &&
+                      (mi_col + width < cm->mi_cols);
+           width += width_step) {
+        if (!sb_all_skip(cm, mi_row + height, mi_col + width))
+          aom_write_literal(
+              w, cm->mi_grid_visible[(mi_row + height) * cm->mi_stride +
+                                     (mi_col + width)]
+                     ->mbmi.cdef_strength,
+              cm->cdef_bits);
+      }
+    }
   }
 #endif
 }
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 50cf8d7..e4ec388 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -285,7 +285,7 @@
   int sbr, sbc;
   uint16_t *src[3];
   uint16_t *ref_coeff[3];
-  dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  dering_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride[3];
@@ -300,8 +300,8 @@
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   uint64_t tot_mse;
   int sb_count;
-  int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+  int nvsb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int nhsb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
   int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
   uint64_t(*mse[2])[TOTAL_STRENGTHS];
@@ -387,14 +387,14 @@
       int nvb, nhb;
       int gi;
       int dirinit = 0;
-      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
-      nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
-      cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                          MAX_MIB_SIZE * sbc]
+      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * sbc);
+      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * sbr);
+      cm->mi_grid_visible[MI_SIZE_64X64 * sbr * cm->mi_stride +
+                          MI_SIZE_64X64 * sbc]
           ->mbmi.cdef_strength = -1;
-      if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
-      dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
-                                            sbc * MAX_MIB_SIZE, dlist, 1);
+      if (sb_all_skip(cm, sbr * MI_SIZE_64X64, sbc * MI_SIZE_64X64)) continue;
+      dering_count = sb_compute_dering_list(cm, sbr * MI_SIZE_64X64,
+                                            sbc * MI_SIZE_64X64, dlist, 1);
       for (pli = 0; pli < nplanes; pli++) {
         for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
           inbuf[i] = OD_DERING_VERY_LARGE;
@@ -419,8 +419,8 @@
           if (clpf_strength == 0)
             copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
                          src[pli],
-                         (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) - yoff,
-                         (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]) - xoff,
+                         (sbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+                         (sbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
                          stride[pli], ysize, xsize);
           od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE,
                     tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli,
@@ -429,8 +429,8 @@
                     dering_damping, coeff_shift, clpf_strength != 0, 1);
           curr_mse = compute_dering_dist(
               ref_coeff[pli] +
-                  (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) * stride[pli] +
-                  (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]),
+                  (sbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
+                  (sbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
               stride[pli], tmp_dst, dlist, dering_count, bsize[pli],
               coeff_shift, pli);
           if (pli < 2)
@@ -438,7 +438,7 @@
           else
             mse[1][sb_count][gi] += curr_mse;
           sb_index[sb_count] =
-              MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
+              MI_SIZE_64X64 * sbr * cm->mi_stride + MI_SIZE_64X64 * sbc;
         }
       }
       sb_count++;