[CFL] basic early termination for alpha search

This causes no change in the encoder output.
Comparing simple SSE-based RDO with the switch to
txfm_rd_in_plane, the overhead is reduced by 23% ~ 50%.
The total encode time increase is now 2.3% ~ 3.1%.

Change-Id: I48c76216871f8ed68631815fd781697139305e94
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b871dd3..1ed95cb7 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -5360,18 +5360,19 @@
 #if CONFIG_CFL
 static void txfm_rd_in_plane_once(MACROBLOCK *const x,
                                   const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                                  TX_SIZE tx_size, int plane, int64_t *dist,
-                                  int *rate) {
+                                  TX_SIZE tx_size, int plane, int64_t best_rd,
+                                  int64_t *dist, int *rate) {
   RD_STATS rd_stats;
   av1_init_rd_stats(&rd_stats);
-  txfm_rd_in_plane(x, cpi, &rd_stats, INT64_MAX, plane, bsize, tx_size,
+  txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane, bsize, tx_size,
                    cpi->sf.use_fast_coef_costing);
   *dist = rd_stats.dist;
   *rate = rd_stats.rate;
 }
 
 static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size) {
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int64_t best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
@@ -5381,10 +5382,10 @@
   int64_t dists[CFL_PRED_PLANES][CFL_MAGS_SIZE];
   mbmi->cfl_alpha_idx = 0;
   mbmi->cfl_alpha_signs = CFL_SIGN_ZERO * CFL_SIGNS + CFL_SIGN_POS - 1;
-  txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_U,
+  txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_U, best_rd,
                         &dists[CFL_PRED_U][0], &rates[CFL_PRED_U][0]);
   mbmi->cfl_alpha_signs = CFL_SIGN_POS * CFL_SIGNS + CFL_SIGN_ZERO - 1;
-  txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_V,
+  txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_V, best_rd,
                         &dists[CFL_PRED_V][0], &rates[CFL_PRED_V][0]);
 
   for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
@@ -5392,9 +5393,9 @@
     for (int sign = CFL_SIGN_NEG; sign < CFL_SIGNS; sign++) {
       const int m = c * 2 + 1 + (sign == CFL_SIGN_NEG);
       mbmi->cfl_alpha_signs = sign * CFL_SIGNS + sign - 1;
-      txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_U,
+      txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_U, best_rd,
                             &dists[CFL_PRED_U][m], &rates[CFL_PRED_U][m]);
-      txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_V,
+      txfm_rd_in_plane_once(x, cpi, bsize, tx_size, AOM_PLANE_V, best_rd,
                             &dists[CFL_PRED_V][m], &rates[CFL_PRED_V][m]);
     }
   }
@@ -5402,9 +5403,9 @@
   int64_t dist;
   int64_t cost;
   int64_t best_cost = INT64_MAX;
-  int best_rate_overhead = 0;
+  int best_rate_overhead = INT_MAX;
 #if CONFIG_DEBUG
-  int best_rate = 0;
+  int best_rate = INT_MAX;
 #endif  // CONFIG_DEBUG
 
   int ind = 0;
@@ -5416,17 +5417,19 @@
     const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
     const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
     for (int u = 0; u < size_u; u++) {
-      const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1;
+      const int idx_u = ((sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1) +
+                        (sign_u == CFL_SIGN_NEG);
+      if (rates[CFL_PRED_U][idx_u] == INT_MAX) continue;
       for (int v = 0; v < size_v; v++) {
-        const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1;
-        dist = dists[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] +
-               dists[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)];
+        const int idx_v = ((sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1) +
+                          (sign_v == CFL_SIGN_NEG);
+        if (rates[CFL_PRED_V][idx_v] == INT_MAX) continue;
+        dist = dists[CFL_PRED_U][idx_u] + dists[CFL_PRED_V][idx_v];
         int rate_overhead = x->cfl_cost[joint_sign][CFL_PRED_U][u] +
                             x->cfl_cost[joint_sign][CFL_PRED_V][v];
         int rate = x->intra_uv_mode_cost[mbmi->mode][UV_CFL_PRED] +
-                   rate_overhead +
-                   rates[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] +
-                   rates[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)];
+                   rate_overhead + rates[CFL_PRED_U][idx_u] +
+                   rates[CFL_PRED_V][idx_v];
         cost = RDCOST(x->rdmult, rate, dist);
         if (cost < best_cost) {
           best_cost = cost;
@@ -5489,7 +5492,8 @@
     if (mode == UV_CFL_PRED) {
       assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
-      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, bsize, uv_tx_size);
+      cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, bsize, uv_tx_size, best_rd);
+      if (cfl_alpha_rate == INT_MAX) continue;
     }
 #endif
 #if CONFIG_EXT_INTRA