Don't use TX_64X64 transforms on a subsampled chroma plane

With ext-partition and tx64x64 enabled, the encoder could choose
TX_64X64 as the transform size for a subsampled plane of a
BLOCK_128X128 block. This broke the assumption in the nested loop in
write_tokens_b that the maximum transform size is no larger than the
unit the loop steps by (bkw <= mu_blocks_wide, bkh <= mu_blocks_high),
and it also caused bug 827 (with a rather cryptic failure).

This patch changes get_vartx_max_txsize so that neither the encoder nor
the decoder thinks it can use TX_64X64 in this situation, and updates
the CONFIG_TX64X64 rows of three lookup tables in common_data.h so that
every entry except the first is TX_32X32. It also adds a couple of
assertions to the loop mentioned above so that if something goes wrong
the cause will be much more obvious.

BUG=aomedia:827

Change-Id: Ie093f2f20f6242949d68e950c8f95b100867ee17
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 78beb97..8506e0a 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1473,13 +1473,28 @@
 
 #if CONFIG_VAR_TX
 static INLINE int get_vartx_max_txsize(const MB_MODE_INFO *const mbmi,
-                                       BLOCK_SIZE bsize) {
+                                       BLOCK_SIZE bsize, int subsampled) {
 #if CONFIG_CB4X4
   (void)mbmi;
-  return max_txsize_rect_lookup[bsize];
+  TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
+#else
+  TX_SIZE max_txsize = mbmi->sb_type < BLOCK_8X8
+                           ? max_txsize_rect_lookup[mbmi->sb_type]
+                           : max_txsize_rect_lookup[bsize];
 #endif  // CONFIG_C4X4
-  return mbmi->sb_type < BLOCK_8X8 ? max_txsize_rect_lookup[mbmi->sb_type]
-                                   : max_txsize_rect_lookup[bsize];
+
+#if CONFIG_EXT_PARTITION && CONFIG_TX64X64
+  // The decoder is designed so that it can process 64x64 luma pixels at a
+  // time. If this is a chroma plane with subsampling and bsize corresponds to
+  // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
+  // mustn't be used for the subsampled plane (because it would be bigger than
+  // a 64x64 luma block) so we round down to TX_32X32.
+  if (subsampled && max_txsize == TX_64X64) max_txsize = TX_32X32;
+#else
+  (void)subsampled;
+#endif
+
+  return max_txsize;
 }
 #endif  // CONFIG_VAR_TX
 
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index 1e5e184..d532735 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -1406,7 +1406,7 @@
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
       { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
 #if CONFIG_TX64X64
-      { { TX_64X64, TX_64X64 }, { TX_32X32, TX_32X32 } },
+      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
 #endif  // CONFIG_TX64X64
       { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
       { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
@@ -1429,7 +1429,7 @@
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
       { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
 #if CONFIG_TX64X64
-      { { TX_64X64, TX_32X32 }, { TX_64X64, TX_32X32 } },
+      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
 #endif  // CONFIG_TX64X64
       { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
       { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
@@ -1452,7 +1452,7 @@
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
       { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
 #if CONFIG_TX64X64
-      { { TX_64X64, TX_64X64 }, { TX_64X64, TX_64X64 } },
+      { { TX_64X64, TX_32X32 }, { TX_32X32, TX_32X32 } },
 #endif  // CONFIG_TX64X64
       { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
       { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0bb6575..58a354a 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -2068,7 +2068,8 @@
         mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
         mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
 
-        const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+        const TX_SIZE max_tx_size = get_vartx_max_txsize(
+            mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
         const int bh_var_tx = tx_size_high_unit[max_tx_size];
         const int bw_var_tx = tx_size_wide_unit[max_tx_size];
         int block = 0;
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 9401f1e..00961e9 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1761,7 +1761,7 @@
       !(is_inter && skip) && !xd->lossless[segment_id]) {
 #if CONFIG_VAR_TX
     if (is_inter) {  // This implies skip flag is 0.
-      const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize);
+      const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize, 0);
       const int bh = tx_size_high_unit[max_tx_size];
       const int bw = tx_size_wide_unit[max_tx_size];
       const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
@@ -2630,12 +2630,15 @@
       mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
 
       if (is_inter_block(mbmi)) {
-        const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+        const TX_SIZE max_tx_size = get_vartx_max_txsize(
+            mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
         int block = 0;
         const int step =
             tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
         const int bkw = tx_size_wide_unit[max_tx_size];
         const int bkh = tx_size_high_unit[max_tx_size];
+        assert(bkw <= mu_blocks_wide);
+        assert(bkh <= mu_blocks_high);
         for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
           const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h);
           for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 571d642..7da552f 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -5927,7 +5927,7 @@
   MACROBLOCKD *xd = &x->e_mbd;
   const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   int idx, idy;
@@ -5986,7 +5986,7 @@
                                       int mi_row, int mi_col) {
   const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   int idx, idy;
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index ecfd1bf..9b24e86 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -942,7 +942,8 @@
     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(
+        mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
     const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
     const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index e5ce850..0836f6c 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -757,7 +757,8 @@
 #endif
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(
+        mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
     int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
     int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];