Super transform - ported from nextgen branch

Various additional changes were made to make the experiment
compatible with misc_fixes.

derflr: +0.979%
hevcmr: +0.865%

Speed-wise with --enable-supertx the encoder is only about 10%
slower than without. Decoding impact is about 30% slowdown.

Note this does not work with ext-tx or var-tx yet. That is
a TODO.

Change-Id: If25af4241a7a9efbd28f58eda3c4f044c7a7ef4b
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 57cd31d..6611868 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -457,11 +457,11 @@
   const int16_t *cat6_high_cost = vp10_get_high_cost_table(8);
 #endif
 
-#if !CONFIG_VAR_TX
+#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                               : get_uv_tx_size(mbmi, pd) == tx_size);
-#endif
+#endif  // !CONFIG_VAR_TX && !CONFIG_SUPERTX
 
   if (eob == 0) {
     // single eob token
@@ -732,6 +732,54 @@
   }
 }
 
+#if CONFIG_SUPERTX
+void vp10_txfm_rd_in_plane_supertx(MACROBLOCK *x,
+#if CONFIG_VAR_TX
+                                   const VP10_COMP *cpi,
+#endif
+                                   int *rate, int64_t *distortion,
+                                   int *skippable, int64_t *sse,
+                                   int64_t ref_best_rd, int plane,
+                                   BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                   int use_fast_coef_casting) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+  TX_TYPE tx_type;
+
+  vp10_zero(args);
+  args.x = x;
+#if CONFIG_VAR_TX
+  args.cpi = cpi;
+#endif
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
+
+  if (plane == 0)
+    xd->mi[0]->mbmi.tx_size = tx_size;
+
+  vp10_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+  tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+  args.so = get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+
+  block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd),
+                tx_size, &args);
+
+  if (args.exit_early) {
+    *rate       = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse        = INT64_MAX;
+    *skippable  = 0;
+  } else {
+    *distortion = args.this_dist;
+    *rate       = args.this_rate;
+    *sse        = args.this_sse;
+    *skippable  = !x->plane[plane].eobs[0];
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
                                    int *rate, int64_t *distortion,
                                    int *skip, int64_t *sse,
@@ -4855,7 +4903,11 @@
                                 TileDataEnc *tile_data,
                                 MACROBLOCK *x,
                                 int mi_row, int mi_col,
-                                RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                BLOCK_SIZE bsize,
                                 PICK_MODE_CONTEXT *ctx,
                                 int64_t best_rd_so_far) {
   VP10_COMMON *const cm = &cpi->common;
@@ -4954,6 +5006,9 @@
   }
 
   rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+  *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -5375,7 +5430,8 @@
       if (skippable) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
-
+        rate_y = 0;
+        rate_uv = 0;
         // Cost the skip mb case
         rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
 
@@ -5391,6 +5447,8 @@
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
           this_skip2 = 1;
+          rate_y = 0;
+          rate_uv = 0;
         }
       } else {
         // Add in the cost of the no skip flag.
@@ -5437,6 +5495,15 @@
         }
 
         rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+        *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        if (!disable_skip) {
+          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+                                              skippable || this_skip2);
+        }
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                            mbmi->ref_frame[0] != INTRA_FRAME);
+#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
@@ -5760,14 +5827,17 @@
                        best_pred_diff, best_filter_diff, 0);
 }
 
-void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
-                                   TileDataEnc *tile_data,
-                                   MACROBLOCK *x,
-                                   int mi_row, int mi_col,
-                                   RD_COST *rd_cost,
-                                   BLOCK_SIZE bsize,
-                                   PICK_MODE_CONTEXT *ctx,
-                                   int64_t best_rd_so_far) {
+void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
+                                    TileDataEnc *tile_data,
+                                    struct macroblock *x,
+                                    int mi_row, int mi_col,
+                                    struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                    int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                    BLOCK_SIZE bsize,
+                                    PICK_MODE_CONTEXT *ctx,
+                                    int64_t best_rd_so_far) {
   VP10_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -5816,6 +5886,11 @@
   int internal_active_edge =
     vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
 
+#if CONFIG_SUPERTX
+  best_rd_so_far = INT64_MAX;
+  best_rd = best_rd_so_far;
+  best_yrd = best_rd_so_far;
+#endif  // CONFIG_SUPERTX
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
   vp10_zero(best_mbmode);
 
@@ -5843,6 +5918,9 @@
   rate_uv_intra = INT_MAX;
 
   rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+  *returnrate_nocoef = INT_MAX;
+#endif
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
@@ -6300,6 +6378,15 @@
         }
 
         rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+        *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        if (!disable_skip)
+          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+                                              this_skip2);
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                            mbmi->ref_frame[0] != INTRA_FRAME);
+        assert(*returnrate_nocoef > 0);
+#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
@@ -6402,6 +6489,9 @@
   if (best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+    *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
     return;
   }
 
@@ -6422,6 +6512,9 @@
     rd_cost->rate = INT_MAX;
     rd_cost->dist = INT64_MAX;
     rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+    *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
     return;
   }