enable skip prediction for large blocks

This patch enables the skip prediction feature for blocks larger than
16x16; but the tx size used in this module is still limited to 16x16.

Also makes it exit early upon large quantized coeff.

Test results:
-encoding speed up by 0~17%, on average 8.6%, based on several lowres
test sequences;
-no quality loss observed, encoding 30 frames on lowres and midres.

BUG=aomedia:1106

Change-Id: Ic2051a978eda5c8ce6a795a9d97d8d114c6edfe2
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index fc301b6..0a1f0c0 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4660,33 +4660,33 @@
 
 static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
   {
-      0,  0,  0,  50, 50, 50, 55, 47, 47, 53, 53, 53, 0, 0, 0, 0,
+      0,  0,  0,  50, 50, 50, 55, 47, 47, 53, 53, 53, 53, 53, 53, 53,
 #if CONFIG_EXT_PARTITION
-      0,  0,  0,
+      53, 53, 53,
 #endif
       50, 50, 55, 55, 53, 53,
 #if CONFIG_EXT_PARTITION
-      0,  0,
+      53, 53,
 #endif
   },
   {
-      0,  0,  0,  69, 69, 69, 67, 68, 68, 53, 53, 53, 0, 0, 0, 0,
+      0,  0,  0,  69, 69, 69, 67, 68, 68, 53, 53, 53, 53, 53, 53, 53,
 #if CONFIG_EXT_PARTITION
-      0,  0,  0,
+      53, 53, 53,
 #endif
       69, 69, 67, 67, 53, 53,
 #if CONFIG_EXT_PARTITION
-      0,  0,
+      53, 53,
 #endif
   },
   {
-      0,  0,  0,  70, 73, 73, 70, 73, 73, 58, 58, 58, 0, 0, 0, 0,
+      0,  0,  0,  70, 73, 73, 70, 73, 73, 58, 58, 58, 58, 58, 58, 58,
 #if CONFIG_EXT_PARTITION
-      0,  0,  0,
+      58, 58, 58,
 #endif
       70, 70, 70, 70, 58, 58,
 #if CONFIG_EXT_PARTITION
-      0,  0,
+      58, 58,
 #endif
   }
 };
@@ -4695,12 +4695,12 @@
 // whether optimal RD decision is to skip encoding the residual.
 // The sse value is stored in dist.
 static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist) {
-  const int max_tx_size =
+  int max_tx_size =
       get_max_rect_tx_size(bsize, is_inter_block(&x->e_mbd.mi[0]->mbmi));
+  if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+    max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
   const int tx_h = tx_size_high[max_tx_size];
   const int tx_w = tx_size_wide[max_tx_size];
-  if (tx_h > 16 || tx_w > 16) return 0;
-
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -4732,6 +4732,8 @@
                                           is_inter_block(&xd->mi[0]->mbmi), 0);
   const uint32_t ac_q = (uint32_t)av1_ac_quant_QTX(x->qindex, 0, xd->bd);
   uint32_t max_quantized_coef = 0;
+  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
   const int16_t *src_diff = x->plane[0].src_diff;
   for (int row = 0; row < bh; row += tx_h) {
     for (int col = 0; col < bw; col += tx_w) {
@@ -4748,14 +4750,15 @@
       for (int i = 0; i < tx_w * tx_h; ++i) {
         uint32_t cur_quantized_coef =
             (100 * (uint32_t)abs(DCT_coefs[i])) / (i ? ac_q : dc_q);
-        if (cur_quantized_coef > max_quantized_coef)
+        if (cur_quantized_coef > max_quantized_coef) {
           max_quantized_coef = cur_quantized_coef;
+          if (max_quantized_coef >= max_qcoef_thresh) return 0;
+        }
       }
     }
     src_diff += tx_h * bw;
   }
-  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
-  return max_quantized_coef < skip_pred_threshold[bd_idx][bsize];
+  return max_quantized_coef < max_qcoef_thresh;
 }
 
 // Used to set proper context for early termination with skip = 1.