Make intra coding processed in 64x64 unit

Process the intra block coding in maximum of 64x64 block unit.
This allows to simpilify the hardware design for 128x128 level
block coding.

Change-Id: I6ef1dd8549bdb53b2d2394c6aa7d61cea93686bf
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 7ec08e8..7a37f6f 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -145,13 +145,27 @@
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
 
+  int blk_row, blk_col;
+
+  const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+  mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+  mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += txh_unit) {
+  for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
     // Skip visiting the sub blocks that are wholly within the UMV.
-    for (c = 0; c < max_blocks_wide; c += txw_unit) {
-      visit(plane, i, r, c, plane_bsize, tx_size, arg);
-      i += step;
+    for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+      const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+      const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+      for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+        for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+          visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+          i += step;
+        }
+      }
     }
   }
 }
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index d424d63..50c7466 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1988,11 +1988,26 @@
                                pd->subsampling_y))
         continue;
 #endif
+      int blk_row, blk_col;
+      const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd);
+      int mu_blocks_wide =
+          block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+      int mu_blocks_high =
+          block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+      mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+      mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
 
-      for (row = 0; row < max_blocks_high; row += stepr)
-        for (col = 0; col < max_blocks_wide; col += stepc)
-          predict_and_reconstruct_intra_block(cm, xd, r, mbmi, plane, row, col,
-                                              tx_size);
+      for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+        for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+          const int unit_height = AOMMIN(mu_blocks_high + row, max_blocks_high);
+          const int unit_width = AOMMIN(mu_blocks_wide + col, max_blocks_wide);
+
+          for (blk_row = row; blk_row < unit_height; blk_row += stepr)
+            for (blk_col = col; blk_col < unit_width; blk_col += stepc)
+              predict_and_reconstruct_intra_block(cm, xd, r, mbmi, plane,
+                                                  blk_row, blk_col, tx_size);
+        }
+      }
     }
   } else {
     int ref;
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index e6fb5b6..c9996c6 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -2672,13 +2672,23 @@
         const TX_SIZE tx = av1_get_tx_size(plane, xd);
         const int bkw = tx_size_wide_unit[tx];
         const int bkh = tx_size_high_unit[tx];
-        for (row = 0; row < num_4x4_h; row += bkh) {
-          for (col = 0; col < num_4x4_w; col += bkw) {
+        int blk_row, blk_col;
+
+        for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
+          for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+            const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h);
+            const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w);
+
+            for (blk_row = row; blk_row < unit_height; blk_row += bkh) {
+              for (blk_col = col; blk_col < unit_width; blk_col += bkw) {
 #if !CONFIG_PVQ
-            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+                pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
+                               &token_stats);
 #else
-            pack_pvq_tokens(w, x, xd, plane, bsize, tx);
+                pack_pvq_tokens(w, x, xd, plane, bsize, tx);
 #endif
+              }
+            }
           }
         }
 #endif  // CONFIG_LV_MAP