Limit the transform block partition depth

Limit the recursive transform block partition depth to 2. For example,
a 32x32 transform block unit can be split down to an 8x8 transform
block size at most.

Change-Id: I2caa92bb2eee64762b7ecca8920259f7c50fb0aa
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index e0fb7ec..92e98b2 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -352,7 +352,8 @@
 #if CONFIG_VAR_TX
 static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                                 const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
-                                int blk_row, int blk_col, aom_writer *w) {
+                                int depth, int blk_row, int blk_col,
+                                aom_writer *w) {
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
@@ -365,6 +366,12 @@
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
+  if (depth == 2) {
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+    return;
+  }
+
   if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
     aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
     txfm_partition_update(xd->above_txfm_context + tx_col,
@@ -386,7 +393,8 @@
     for (i = 0; i < 4; ++i) {
       int offsetr = blk_row + ((i >> 1) << bsl);
       int offsetc = blk_col + ((i & 0x01) << bsl);
-      write_tx_size_vartx(cm, xd, mbmi, tx_size - 1, offsetr, offsetc, w);
+      write_tx_size_vartx(cm, xd, mbmi, tx_size - 1, depth + 1, offsetr,
+                          offsetc, w);
     }
   }
 }
@@ -1216,7 +1224,8 @@
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
         for (idy = 0; idy < height; idy += bs)
           for (idx = 0; idx < width; idx += bs)
-            write_tx_size_vartx(cm, xd, mbmi, max_tx_size, idy, idx, w);
+            write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy,
+                                idx, w);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 627352b..1351281 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3020,11 +3020,11 @@
 
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                             int blk_col, int plane, int block, TX_SIZE tx_size,
-                            BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
-                            ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
-                            TXFM_CONTEXT *tx_left, int *rate, int64_t *dist,
-                            int64_t *bsse, int *skip, int64_t ref_best_rd,
-                            int *is_cost_valid) {
+                            int depth, BLOCK_SIZE plane_bsize,
+                            ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+                            TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+                            int *rate, int64_t *dist, int64_t *bsse, int *skip,
+                            int64_t ref_best_rd, int *is_cost_valid) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -3116,13 +3116,13 @@
       *skip = 0;
     }
 
-    if (tx_size > TX_4X4)
+    if (tx_size > TX_4X4 && depth < 2)
       *rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
     this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
     tmp_eob = p->eobs[block];
   }
 
-  if (tx_size > TX_4X4) {
+  if (tx_size > TX_4X4 && depth < 2) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int bsl = b_height_log2_lookup[bsize];
     int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1];
@@ -3141,9 +3141,10 @@
       int offsetr = (i >> 1) << bsl;
       int offsetc = (i & 0x01) << bsl;
       select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
-                      block + i * sub_step, tx_size - 1, plane_bsize, ta, tl,
-                      tx_above, tx_left, &this_rate, &this_dist, &this_bsse,
-                      &this_skip, ref_best_rd - tmp_rd, &this_cost_valid);
+                      block + i * sub_step, tx_size - 1, depth + 1, plane_bsize,
+                      ta, tl, tx_above, tx_left, &this_rate, &this_dist,
+                      &this_bsse, &this_skip, ref_best_rd - tmp_rd,
+                      &this_cost_valid);
       sum_rate += this_rate;
       sum_dist += this_dist;
       sum_bsse += this_bsse;
@@ -3219,9 +3220,10 @@
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bh) {
         select_tx_block(cpi, x, idy, idx, 0, block,
-                        max_txsize_lookup[plane_bsize], plane_bsize, ctxa, ctxl,
-                        tx_above, tx_left, &pnrate, &pndist, &pnsse, &pnskip,
-                        ref_best_rd - this_rd, &is_cost_valid);
+                        max_txsize_lookup[plane_bsize], mi_height != mi_width,
+                        plane_bsize, ctxa, ctxl, tx_above, tx_left, &pnrate,
+                        &pndist, &pnsse, &pnskip, ref_best_rd - this_rd,
+                        &is_cost_valid);
         *rate += pnrate;
         *distortion += pndist;
         *sse += pnsse;
@@ -7586,7 +7588,7 @@
       // Y cost and distortion
       av1_subtract_plane(x, bsize, 0);
 #if CONFIG_VAR_TX
-      if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+      if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
         select_tx_type_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
                            bsize, ref_best_rd);
       } else {