Merge "Pass AV1_COMMON into get_scan" into nextgenv2
diff --git a/aom_dsp/answriter.h b/aom_dsp/answriter.h
index 298b255..370472a 100644
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -20,8 +20,23 @@
 #include "aom_dsp/ans.h"
 #include "aom_dsp/prob.h"
 #include "aom_ports/mem_ops.h"
+#include "av1/common/odintrin.h"
 
-#define ANS_DIV(dividend, divisor) ((dividend) / (divisor))
+#if RANS_PRECISION <= OD_DIVU_DMAX
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do {                                                     \
+    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
+    remainder = (dividend) - (quotient) * (divisor);       \
+  } while (0)
+#else
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do {                                                     \
+    quotient = (dividend) / (divisor);                     \
+    remainder = (dividend) % (divisor);                    \
+  } while (0)
+#endif
+
+#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
 
 #ifdef __cplusplus
 extern "C" {
@@ -72,9 +87,9 @@
     ans->state /= IO_BASE;
   }
   if (!val)
-    ans->state = ANS_DIV(ans->state * ANS_P8_PRECISION, p0);
+    ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
   else
-    ans->state = ANS_DIV((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
+    ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
 }
 
 struct rans_sym {
@@ -88,15 +103,17 @@
 static INLINE void rans_write(struct AnsCoder *ans,
                               const struct rans_sym *const sym) {
   const aom_cdf_prob p = sym->prob;
+  unsigned quot, rem;
   while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
     ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
     ans->state /= IO_BASE;
   }
-  ans->state =
-      (ans->state / p) * RANS_PRECISION + ans->state % p + sym->cum_prob;
+  ANS_DIVREM(quot, rem, ans->state, p);
+  ans->state = quot * RANS_PRECISION + rem + sym->cum_prob;
 }
 
-#undef ANS_DIV
+#undef ANS_DIV8
+#undef ANS_DIVREM
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/aom_dsp/entdec.c b/aom_dsp/entdec.c
index 3f2feab..18563b2 100644
--- a/aom_dsp/entdec.c
+++ b/aom_dsp/entdec.c
@@ -206,6 +206,7 @@
   od_ec_window dif;
   od_ec_window vw;
   unsigned r;
+  unsigned r_new;
   unsigned v;
   int ret;
   OD_ASSERT(0 < fz);
@@ -216,10 +217,14 @@
   OD_ASSERT(32768U <= r);
   v = fz * (uint32_t)r >> 15;
   vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-  ret = dif >= vw;
-  if (ret) dif -= vw;
-  r = ret ? r - v : v;
-  return od_ec_dec_normalize(dec, dif, r, ret);
+  ret = 0;
+  r_new = v;
+  if (dif >= vw) {
+    r_new = r - v;
+    dif -= vw;
+    ret = 1;
+  }
+  return od_ec_dec_normalize(dec, dif, r_new, ret);
 }
 
 /*Decodes a symbol given a cumulative distribution function (CDF) table.
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 4d9bff9..bc1970c 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -191,7 +191,6 @@
   TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
 #endif
   int8_t skip;
-  int8_t has_no_coeffs;
   int8_t segment_id;
 #if CONFIG_SUPERTX
   // Minimum of all segment IDs under the current supertx block.
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index 4165e35..f068ee7 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -444,7 +444,12 @@
 
 static const int tx_size_1d[TX_SIZES] = { 4, 8, 16, 32 };
 
-static const int tx_size_2d[TX_SIZES] = { 16, 64, 256, 1024 };
+static const int tx_size_2d[TX_SIZES_ALL] = {
+  16, 64, 256, 1024,
+#if CONFIG_EXT_TX
+  32, 32, 128, 128,  512, 512,
+#endif
+};
 
 static const uint8_t tx_size_1d_log2[TX_SIZES] = { 2, 3, 4, 5 };
 
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index c8022f2..d0b897c 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -753,7 +753,7 @@
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-  if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+  if (mbmi->skip && is_inter_block(mbmi)) return;
 
   // Here we are adding a mask for the transform size. The transform
   // size mask is set to be correct for a 64x64 prediction block size. We
@@ -818,7 +818,7 @@
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
 
-  if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+  if (mbmi->skip && is_inter_block(mbmi)) return;
 
   *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y])
               << shift_y;
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index c0fc494..b07a8bd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -647,6 +647,87 @@
   }
 #endif
 
+#if CONFIG_SUB8X8_MC
+  if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
+    // block size in log2
+    const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
+    const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
+    const int b8_sl = b_width_log2_lookup[BLOCK_8X8];
+
+    // block size
+    const int b4_w = 1 << b4_wl;
+    const int b4_h = 1 << b4_hl;
+    const int b8_s = 1 << b8_sl;
+    int idx, idy;
+
+    const int x_base = x;
+    const int y_base = y;
+
+    // processing unit size
+    const int x_step = w >> (b8_sl - b4_wl);
+    const int y_step = h >> (b8_sl - b4_hl);
+
+    for (idy = 0; idy < b8_s; idy += b4_h) {
+      for (idx = 0; idx < b8_s; idx += b4_w) {
+        const int chr_idx = (idy * 2) + idx;
+        for (ref = 0; ref < 1 + is_compound; ++ref) {
+          const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+          struct buf_2d *const pre_buf = &pd->pre[ref];
+          struct buf_2d *const dst_buf = &pd->dst;
+          uint8_t *dst = dst_buf->buf;
+          const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+          const MV mv_q4 = clamp_mv_to_umv_border_sb(
+              xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+          uint8_t *pre;
+          MV32 scaled_mv;
+          int xs, ys, subpel_x, subpel_y;
+          const int is_scaled = av1_is_scaled(sf);
+
+          x = x_base + idx * x_step;
+          y = y_base + idy * y_step;
+
+          dst += dst_buf->stride * y + x;
+
+          if (is_scaled) {
+            pre =
+                pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+            scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+            xs = sf->x_step_q4;
+            ys = sf->y_step_q4;
+          } else {
+            pre = pre_buf->buf + y * pre_buf->stride + x;
+            scaled_mv.row = mv_q4.row;
+            scaled_mv.col = mv_q4.col;
+            xs = ys = 16;
+          }
+
+          subpel_x = scaled_mv.col & SUBPEL_MASK;
+          subpel_y = scaled_mv.row & SUBPEL_MASK;
+          pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+                 (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                                 subpel_x, subpel_y, sf, x_step, y_step, ref,
+                                 &mi->mbmi.interp_filter, xs, ys, xd->bd);
+          } else {
+            inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                            subpel_x, subpel_y, sf, x_step, y_step, ref,
+                            &mi->mbmi.interp_filter, xs, ys);
+          }
+#else
+          inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+                          subpel_y, sf, x_step, y_step, ref,
+                          &mi->mbmi.interp_filter, xs, ys);
+#endif
+        }
+      }
+    }
+    return;
+  }
+#endif
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
     struct buf_2d *const pre_buf = &pd->pre[ref];
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index bfa7e95..5f62f0a 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -50,7 +50,7 @@
     const int16_t *kernel_y =
         av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
 #else
-  if (interp_filter_params.taps == SUBPEL_TAPS) {
+  if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
     const int16_t *kernel_x =
         av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
     const int16_t *kernel_y =
@@ -109,7 +109,7 @@
     const int16_t *kernel_y =
         av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
 #else
-  if (interp_filter_params.taps == SUBPEL_TAPS) {
+  if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
     const int16_t *kernel_x =
         av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
     const int16_t *kernel_y =
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index bbd788b..2e6e744 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -253,22 +253,12 @@
     }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-    if (eob == 1) {
+    // TODO(jingning): This consolidates the different reset requests from
+    // various experiments, but zeroes more memory than strictly necessary.
+    if (eob == 1)
       dqcoeff[0] = 0;
-    } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * 4 * num_4x4_blocks_wide_txsize_lookup[tx_size] *
-                               sizeof(dqcoeff[0]));
-#if CONFIG_EXT_TX
-      else
-        memset(dqcoeff, 0, get_tx2d_size(tx_size) * sizeof(dqcoeff[0]));
-#else
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, get_tx2d_size(tx_size) * sizeof(dqcoeff[0]));
-#endif
-    }
+    else
+      memset(dqcoeff, 0, tx_size_2d[tx_size] * sizeof(dqcoeff[0]));
   }
 }
 
@@ -1154,7 +1144,6 @@
 #endif  // CONFIG_EXT_PARTITION_TYPES
                          BLOCK_SIZE bsize, int bwl, int bhl) {
   AV1_COMMON *const cm = &pbi->common;
-  const int less8x8 = bsize < BLOCK_8X8;
   const int bw = 1 << (bwl - 1);
   const int bh = 1 << (bhl - 1);
   const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
@@ -1374,9 +1363,6 @@
                                                 plane, row, col, tx_size);
 #endif
       }
-
-      if (!less8x8 && eobtotal == 0)
-        mbmi->has_no_coeffs = 1;  // skip loopfilter
     }
   }
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index fdbb4dd..9dc6a2e 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4489,6 +4489,10 @@
 #define MIN_TRANS_THRESH 8
 #define GLOBAL_MOTION_ADVANTAGE_THRESH 0.60
 #define GLOBAL_MOTION_MODEL ROTZOOM
+// TODO(sarahparker) This function needs to be adjusted
+// to accommodate changes in the parameter integerization.
+// Commenting it out until the fix is made.
+/*
 static void refine_integerized_param(WarpedMotionParams *wm,
 #if CONFIG_AOM_HIGHBITDEPTH
                                      int use_hbd, int bd,
@@ -4565,6 +4569,7 @@
     *param = best_param;
   }
 }
+*/
 
 static void convert_to_params(const double *params, TransformationType type,
                               int16_t *model) {
@@ -4579,7 +4584,7 @@
              GM_TRANS_DECODE_FACTOR;
 
   for (i = 2; i < n_params; ++i) {
-    diag_value = ((i && 1) ? (1 << GM_ALPHA_PREC_BITS) : 0);
+    diag_value = ((i & 1) ? (1 << GM_ALPHA_PREC_BITS) : 0);
     model[i] = (int16_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
     model[i] =
         (int16_t)(clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX) +
@@ -4643,14 +4648,6 @@
           convert_model_to_params(params, GLOBAL_MOTION_MODEL,
                                   &cm->global_motion[frame]);
           if (get_gmtype(&cm->global_motion[frame]) > GLOBAL_ZERO) {
-            refine_integerized_param(
-                &cm->global_motion[frame].motion_params,
-#if CONFIG_AOM_HIGHBITDEPTH
-                xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-                ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
-                ref_buf->y_stride, cpi->Source->y_buffer, cpi->Source->y_width,
-                cpi->Source->y_height, cpi->Source->y_stride, 3);
             // compute the advantage of using gm parameters over 0 motion
             erroradvantage = av1_warp_erroradv(
                 &cm->global_motion[frame].motion_params,