Port optimize_b cache optimizations to AV1.

Bring the following libvpx commits to aom:
e446ffd Cache optimizations in optimize_b()
50d3629 Repack vp9_token_state

Saves 24600 bytes of stack in the default configuration.

Change-Id: If9d6506cf3fe1c34ab639dedb3ef62a996293781
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index c2c68a5..23607e2 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -73,12 +73,13 @@
 }
 
 typedef struct av1_token_state {
-  int rate;
   int64_t error;
-  int next;
+  int rate;
+  int16_t next;
   int16_t token;
   tran_low_t qc;
   tran_low_t dqc;
+  uint8_t best_index;
 } av1_token_state;
 
 // These numbers are empirically obtained.
@@ -114,7 +115,6 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
   av1_token_state tokens[MAX_TX_SQUARE + 1][2];
-  unsigned best_index[MAX_TX_SQUARE + 1][2];
   uint8_t token_cache[MAX_TX_SQUARE];
   const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -242,7 +242,7 @@
       tokens[i][0].next = next;
       tokens[i][0].qc = x;
       tokens[i][0].dqc = dqcoeff[rc];
-      best_index[i][0] = best;
+      tokens[i][0].best_index = best;
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
@@ -282,7 +282,6 @@
         x -= 2 * sz + 1;
       } else {
         tokens[i][1] = tokens[i][0];
-        best_index[i][1] = best_index[i][0];
         next = i;
 
         if (UNLIKELY(!(--band_left))) {
@@ -392,7 +391,7 @@
         tokens[i][1].dqc = 0;
       }
 
-      best_index[i][1] = best;
+      tokens[i][1].best_index = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
     } else {
@@ -411,7 +410,7 @@
         tokens[next][1].rate += get_token_bit_costs(*token_costs, 1, pt, t1);
         tokens[next][1].token = ZERO_TOKEN;
       }
-      best_index[i][0] = best_index[i][1] = 0;
+      tokens[i][0].best_index = tokens[i][1].best_index = 0;
       shortcut = (tokens[next][0].rate != tokens[next][1].rate);
       /* Don't update next, because we didn't add a new node. */
     }
@@ -445,7 +444,7 @@
     dqcoeff[rc] = tokens[i][best].dqc;
 
     next = tokens[i][best].next;
-    best = best_index[i][best];
+    best = tokens[i][best].best_index;
   }
   final_eob++;