Refactor probability savings search.

- Avoid excessive copying
- Don't bother searching if no update can possibly offer savings
- Simplify the interface
- Remove the confusing av1_cost_upd256 macro

(cherry picked from libvpx/master commit
19e0b406c9601ec79f96cba654347e48650929ed)

Change-Id: Id9d9676a361fd1203b27e930cd29c23b2813ce59
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index a36f0c9..6829a80 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -2476,8 +2476,8 @@
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = av1_prob_diff_update_savings_search_model(
-                      frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
+                      frame_branch_ct[i][j][k][l][0], oldp, &newp, upd,
+                      stepsize, probwt);
                 else
                   s = av1_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt);
@@ -2512,8 +2512,8 @@
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = av1_prob_diff_update_savings_search_model(
-                      frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
+                      frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+                      stepsize, probwt);
                 else
                   s = av1_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
@@ -2548,8 +2548,8 @@
                 int u = 0;
                 if (t == PIVOT_NODE) {
                   s = av1_prob_diff_update_savings_search_model(
-                      frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
+                      frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+                      stepsize, probwt);
                 } else {
                   s = av1_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
diff --git a/av1/encoder/subexp.c b/av1/encoder/subexp.c
index 81bb56d..ff213c5 100644
--- a/av1/encoder/subexp.c
+++ b/av1/encoder/subexp.c
@@ -15,8 +15,6 @@
 #include "av1/encoder/cost.h"
 #include "av1/encoder/subexp.h"
 
-#define av1_cost_upd256 ((int)(av1_cost_one(upd) - av1_cost_zero(upd)))
-
 static const uint8_t update_bits[255] = {
   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,
   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  8,  8,  8,  8,  8,  8,
@@ -33,6 +31,7 @@
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 0,
 };
+#define MIN_DELP_BITS 5
 
 static int recenter_nonneg(int v, int m) {
   if (v > (m << 1))
@@ -122,15 +121,17 @@
   int bestsavings = 0;
   aom_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
+  const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
 
-  for (newp = *bestp; newp != oldp; newp += step) {
-    const uint32_t new_b = cost_branch256(ct, newp);
-    const uint32_t update_b =
-        prob_diff_update_cost(newp, oldp) + av1_cost_upd256;
-    const int savings = (int)((int64_t)old_b - new_b - update_b * probwt);
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
+  if (old_b > (uint32_t)upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
+    for (newp = *bestp; newp != oldp; newp += step) {
+      const int new_b = cost_branch256(ct, newp);
+      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      const int savings = (int)((int64_t)old_b - new_b - update_b * probwt);
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
     }
   }
   *bestp = bestnewp;
@@ -138,37 +139,39 @@
 }
 
 int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const aom_prob *oldp,
+                                              const aom_prob oldp,
                                               aom_prob *bestp, aom_prob upd,
                                               int stepsize, int probwt) {
   int i, old_b, new_b, update_b, savings, bestsavings;
   int newp;
-  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+  const int step_sign = *bestp > oldp ? -1 : 1;
   const int step = stepsize * step_sign;
-  aom_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
-  av1_model_to_full_probs(oldp, oldplist);
-  memcpy(newplist, oldp, sizeof(aom_prob) * UNCONSTRAINED_NODES);
-  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
-    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
-  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+  const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
+  const aom_prob *newplist, *oldplist;
+  aom_prob bestnewp;
+  oldplist = av1_pareto8_full[oldp - 1];
+  old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
+  for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
 
   bestsavings = 0;
-  bestnewp = oldp[PIVOT_NODE];
+  bestnewp = oldp;
 
   assert(stepsize > 0);
 
-  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0; newp += step) {
-    if (newp < 1 || newp > 255) continue;
-    newplist[PIVOT_NODE] = newp;
-    av1_model_to_full_probs(newplist, newplist);
-    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
-      new_b += cost_branch256(ct + 2 * i, newplist[i]);
-    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
-    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + av1_cost_upd256;
-    savings = old_b - new_b - update_b * probwt;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
+  if (old_b > upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
+    for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
+      if (newp < 1 || newp > 255) continue;
+      newplist = av1_pareto8_full[newp - 1];
+      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+      for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+        new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
+      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      savings = old_b - new_b - update_b * probwt;
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
     }
   }
 
diff --git a/av1/encoder/subexp.h b/av1/encoder/subexp.h
index d01dea9..9cd3c0b 100644
--- a/av1/encoder/subexp.h
+++ b/av1/encoder/subexp.h
@@ -29,7 +29,7 @@
                                         int probwt);
 
 int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const aom_prob *oldp,
+                                              const aom_prob oldp,
                                               aom_prob *bestp, aom_prob upd,
                                               int stepsize, int probwt);