Merge "Raise the probability resolution for rANS tokens to 10-bits per symbol" into nextgenv2
diff --git a/test/vp10_fwd_txfm2d_sse4_test.cc b/test/vp10_fwd_txfm2d_sse4_test.cc
index 84b1d0f..52e2d38 100644
--- a/test/vp10_fwd_txfm2d_sse4_test.cc
+++ b/test/vp10_fwd_txfm2d_sse4_test.cc
@@ -51,6 +51,7 @@
     int func_idx = get_max_bit(txfm_size) - 2;
     Fwd_Txfm2d_Func txfm2d_func_c = txfm2d_func_c_list[func_idx];
     Fwd_Txfm2d_Func txfm2d_func_sse4_1 = txfm2d_func_sse4_1_list[func_idx];
+    int tx_type = libvpx_test::get_tx_type(&cfg);
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -61,8 +62,8 @@
       }
     }
 
-    txfm2d_func_c(input, output_c, cfg.txfm_size, &cfg, bd);
-    txfm2d_func_sse4_1(input, output_sse4_1, cfg.txfm_size, &cfg, bd);
+    txfm2d_func_c(input, output_c, cfg.txfm_size, tx_type, bd);
+    txfm2d_func_sse4_1(input, output_sse4_1, cfg.txfm_size, tx_type, bd);
     for (int r = 0; r < txfm_size; r++) {
       for (int c = 0; c < txfm_size; c++) {
         EXPECT_EQ(output_c[r * txfm_size + c],
diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
index d5a6737..8e90dc2 100644
--- a/test/vp10_fwd_txfm2d_test.cc
+++ b/test/vp10_fwd_txfm2d_test.cc
@@ -71,6 +71,7 @@
                           fwd_txfm_cfg->shift[2];
         double amplify_factor =
             amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+        int tx_type = libvpx_test::get_tx_type(fwd_txfm_cfg);
 
         ACMRandom rnd(ACMRandom::DeterministicSeed());
         int count = 500;
@@ -83,7 +84,7 @@
             ref_output[ni] = 0;
           }
 
-          fwd_txfm_func(input, output, txfm_size, fwd_txfm_cfg, bd);
+          fwd_txfm_func(input, output, txfm_size, tx_type, bd);
           reference_hybrid_2d(ref_input, ref_output, txfm_size, type0, type1);
 
           for (int ni = 0; ni < sqr_txfm_size; ++ni) {
diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
index fb42bc4..c3552dc 100644
--- a/test/vp10_inv_txfm2d_test.cc
+++ b/test/vp10_inv_txfm2d_test.cc
@@ -30,17 +30,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 const int txfm_size_num = 5;
 const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
-const TXFM_2D_CFG* fwd_txfm_cfg_ls[5][4] = {
-    {&fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_adst_4,
-     &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_dct_4},
-    {&fwd_txfm_2d_cfg_dct_dct_8, &fwd_txfm_2d_cfg_dct_adst_8,
-     &fwd_txfm_2d_cfg_adst_adst_8, &fwd_txfm_2d_cfg_adst_dct_8},
-    {&fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_adst_16,
-     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_dct_16},
-    {&fwd_txfm_2d_cfg_dct_dct_32, &fwd_txfm_2d_cfg_dct_adst_32,
-     &fwd_txfm_2d_cfg_adst_adst_32, &fwd_txfm_2d_cfg_adst_dct_32},
-    {&fwd_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
-
+const int txfm_type[4] = {DCT_DCT, DCT_ADST, ADST_ADST, ADST_DCT};
 const TXFM_2D_CFG* inv_txfm_cfg_ls[5][4] = {
     {&inv_txfm_2d_cfg_dct_dct_4, &inv_txfm_2d_cfg_dct_adst_4,
      &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_dct_4},
@@ -72,11 +62,10 @@
 
     for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
          ++txfm_type_idx) {
-      const TXFM_2D_CFG* fwd_txfm_cfg =
-          fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
       const TXFM_2D_CFG* inv_txfm_cfg =
           inv_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
-      if (fwd_txfm_cfg != NULL) {
+      if (inv_txfm_cfg != NULL) {
+        int tx_type = txfm_type[txfm_type_idx];
         const Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
         const Inv_Txfm2d_Func inv_txfm_func = inv_txfm_func_ls[txfm_size_idx];
         const int count = 1000;
@@ -94,7 +83,7 @@
             }
           }
 
-          fwd_txfm_func(input, output, txfm_size, fwd_txfm_cfg, bd);
+          fwd_txfm_func(input, output, txfm_size, tx_type, bd);
           inv_txfm_func(output, ref_input, txfm_size, inv_txfm_cfg, bd);
 
           for (int ni = 0; ni < sqr_txfm_size; ++ni) {
diff --git a/test/vp10_txfm_test.h b/test/vp10_txfm_test.h
index 6fcc3bb..6b0bd0a 100644
--- a/test/vp10_txfm_test.h
+++ b/test/vp10_txfm_test.h
@@ -21,6 +21,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/acm_random.h"
+#include "vp10/common/enums.h"
 #include "vp10/common/vp10_txfm.h"
 
 namespace libvpx_test {
@@ -104,11 +105,29 @@
                          const int8_t* range_bit);
 
 typedef void (*Fwd_Txfm2d_Func)(const int16_t*, int32_t*, const int,
-                                const TXFM_2D_CFG*, const int);
+                                int tx_type, const int);
 typedef void (*Inv_Txfm2d_Func)(const int32_t*, uint16_t*, const int,
                                 const TXFM_2D_CFG*, const int);
 
 static const int bd = 10;
 static const int input_base = (1 << bd);
+
+// Recovers the 2D transform type from |cfg| by classifying its column and
+// row 1D kernels: a 1D type that compares <= TXFM_TYPE_DCT64 selects the DCT
+// half of the result, anything greater selects the ADST half. The returned
+// enum names the column kernel first and the row kernel second, so a DCT
+// column paired with an ADST row yields DCT_ADST.
+// NOTE(review): presumes every DCT entry of TXFM_TYPE is ordered before the
+// ADST entries -- confirm against vp10_txfm.h.
+static INLINE int get_tx_type(const TXFM_2D_CFG *cfg) {
+  const int col_is_dct = cfg->txfm_type_col <= TXFM_TYPE_DCT64;
+  const int row_is_dct = cfg->txfm_type_row <= TXFM_TYPE_DCT64;
+
+  if (col_is_dct) {
+    return row_is_dct ? DCT_DCT : DCT_ADST;
+  } else {
+    return row_is_dct ? ADST_DCT : ADST_ADST;
+  }
+}
 }  // namespace libvpx_test
 #endif  // VP10_TXFM_TEST_H_
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 23c131d..b0f100e 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -731,9 +731,11 @@
   } else {
     const int w = num_8x8_blocks_wide_lookup[block_size];
     const int h = num_8x8_blocks_high_lookup[block_size];
-    for (i = 0; i < h; i++) {
-      memset(&lfm->lfl_y[i][shift_y], filter_level, w);
-    }
+    const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+    const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+    for (i = 0; i < h; i++)
+      memset(&lfm->lfl_y[row + i][col], filter_level, w);
   }
 
   // These set 1 in the current block size for the block size edges.
@@ -811,9 +813,11 @@
   } else {
     const int w = num_8x8_blocks_wide_lookup[block_size];
     const int h = num_8x8_blocks_high_lookup[block_size];
-    for (i = 0; i < h; i++) {
-      memset(&lfm->lfl_y[i][shift_y], filter_level, w);
-    }
+    const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+    const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+    for (i = 0; i < h; i++)
+      memset(&lfm->lfl_y[row + i][col], filter_level, w);
   }
 
   *above_y |= above_prediction_mask[block_size] << shift_y;
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 674b037..2d2563e 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -454,7 +454,7 @@
     const MACROBLOCKD *xd) {
   const MODE_INFO *mi = xd->mi[0];
 #if CONFIG_VP9_HIGHBITDEPTH
-  uint8_t tmp_dst_[2 * MAX_SB_SQUARE];
+  DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
   uint8_t *tmp_dst =
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
       CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_;
@@ -487,7 +487,7 @@
         mi->mbmi.sb_type, h, w);
 #endif  // CONFIG_SUPERTX
 #else   // CONFIG_VP9_HIGHBITDEPTH
-  uint8_t tmp_dst[MAX_SB_SQUARE];
+  DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
   vp10_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
                             subpel_x, subpel_y, sf, w, h, 0,
                             interp_filter, xs, ys, xd);
@@ -2020,7 +2020,7 @@
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   {
-    uint8_t intrapredictor[MAX_SB_SQUARE];
+    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
     vp10_build_intra_predictors_for_interintra(
         xd, bsize, 0, intrapredictor, MAX_SB_SIZE);
     vp10_combine_interintra(xd, bsize, 0, ypred, ystride,
@@ -2045,7 +2045,7 @@
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   {
-    uint8_t uintrapredictor[MAX_SB_SQUARE];
+    DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
     vp10_build_intra_predictors_for_interintra(
         xd, bsize, plane, uintrapredictor, MAX_SB_SIZE);
     vp10_combine_interintra(xd, bsize, plane, upred, ustride,
@@ -2204,12 +2204,12 @@
     if (ref && get_wedge_bits(mi->mbmi.sb_type)
         && mi->mbmi.use_wedge_interinter) {
 #if CONFIG_VP9_HIGHBITDEPTH
-      uint8_t tmp_dst_[2 * MAX_SB_SQUARE];
+      DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
       uint8_t *tmp_dst =
           (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
           CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_;
 #else
-      uint8_t tmp_dst[MAX_SB_SQUARE];
+      DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #if CONFIG_VP9_HIGHBITDEPTH
         if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
index d54a174..00f8834 100644
--- a/vp10/common/vp10_fwd_txfm2d.c
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -8,8 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp10/common/vp10_txfm.h"
+#include <assert.h>
+
+#include "vp10/common/enums.h"
 #include "vp10/common/vp10_fwd_txfm1d.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "vp10/common/vp10_txfm.h"
 
 static inline TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
@@ -83,41 +87,145 @@
 }
 
 void vp10_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output,
-                         const int stride, const TXFM_2D_CFG *cfg,
+                         const int stride, int tx_type,
                          const int bd) {
   int32_t txfm_buf[4 * 4];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_4x4_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output,
-                         const int stride, const TXFM_2D_CFG *cfg,
+                         const int stride, int tx_type,
                          const int bd) {
   int32_t txfm_buf[8 * 8];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_8x8_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output,
-                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int stride, int tx_type,
                            const int bd) {
   int32_t txfm_buf[16 * 16];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_16x16_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output,
-                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int stride, int tx_type,
                            const int bd) {
   int32_t txfm_buf[32 * 32];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_32x32_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output,
-                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int stride, int tx_type,
                            const int bd) {
   int32_t txfm_buf[64 * 64];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_64x64_cfg(tx_type);  // DCT_DCT only
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
+
+// Returns the forward 4x4 2D transform config matching |tx_type|.
+// Only DCT_DCT, ADST_DCT, DCT_ADST and ADST_ADST are handled. Any other
+// value trips assert(0); when assertions are compiled out (NDEBUG) the
+// function returns NULL instead.
+const TXFM_2D_CFG* vp10_get_txfm_4x4_cfg(int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      return &fwd_txfm_2d_cfg_dct_dct_4;
+    case ADST_DCT:
+      return &fwd_txfm_2d_cfg_adst_dct_4;
+    case DCT_ADST:
+      return &fwd_txfm_2d_cfg_dct_adst_4;
+    case ADST_ADST:
+      return &fwd_txfm_2d_cfg_adst_adst_4;
+    default:
+      // Unhandled type: aborts when assertions are enabled.
+      assert(0);
+      return NULL;
+  }
+}
+
+// Returns the forward 8x8 2D transform config matching |tx_type|.
+// Only DCT_DCT, ADST_DCT, DCT_ADST and ADST_ADST are handled. Any other
+// value trips assert(0); when assertions are compiled out (NDEBUG) the
+// function returns NULL instead.
+const TXFM_2D_CFG* vp10_get_txfm_8x8_cfg(int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      return &fwd_txfm_2d_cfg_dct_dct_8;
+    case ADST_DCT:
+      return &fwd_txfm_2d_cfg_adst_dct_8;
+    case DCT_ADST:
+      return &fwd_txfm_2d_cfg_dct_adst_8;
+    case ADST_ADST:
+      return &fwd_txfm_2d_cfg_adst_adst_8;
+    default:
+      // Unhandled type: aborts when assertions are enabled.
+      assert(0);
+      return NULL;
+  }
+}
+
+// Returns the forward 16x16 2D transform config matching |tx_type|.
+// Only DCT_DCT, ADST_DCT, DCT_ADST and ADST_ADST are handled. Any other
+// value trips assert(0); when assertions are compiled out (NDEBUG) the
+// function returns NULL instead.
+const TXFM_2D_CFG* vp10_get_txfm_16x16_cfg(int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      return &fwd_txfm_2d_cfg_dct_dct_16;
+    case ADST_DCT:
+      return &fwd_txfm_2d_cfg_adst_dct_16;
+    case DCT_ADST:
+      return &fwd_txfm_2d_cfg_dct_adst_16;
+    case ADST_ADST:
+      return &fwd_txfm_2d_cfg_adst_adst_16;
+    default:
+      // Unhandled type: aborts when assertions are enabled.
+      assert(0);
+      return NULL;
+  }
+}
+
+// Returns the forward 32x32 2D transform config matching |tx_type|.
+// Only DCT_DCT, ADST_DCT, DCT_ADST and ADST_ADST are handled. Any other
+// value trips assert(0); when assertions are compiled out (NDEBUG) the
+// function returns NULL instead.
+const TXFM_2D_CFG* vp10_get_txfm_32x32_cfg(int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      return &fwd_txfm_2d_cfg_dct_dct_32;
+    case ADST_DCT:
+      return &fwd_txfm_2d_cfg_adst_dct_32;
+    case DCT_ADST:
+      return &fwd_txfm_2d_cfg_dct_adst_32;
+    case ADST_ADST:
+      return &fwd_txfm_2d_cfg_adst_adst_32;
+    default:
+      // Unhandled type: aborts when assertions are enabled.
+      assert(0);
+      return NULL;
+  }
+}
+
+// Returns the forward 64x64 2D transform config matching |tx_type|.
+// Only DCT_DCT is handled at this size; ADST_DCT, DCT_ADST, ADST_ADST and
+// any other value trip assert(0). When assertions are compiled out (NDEBUG)
+// the function returns NULL instead.
+const TXFM_2D_CFG* vp10_get_txfm_64x64_cfg(int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      return &fwd_txfm_2d_cfg_dct_dct_64;
+    default:
+      // Covers the ADST combinations as well as out-of-range values.
+      assert(0);
+      return NULL;
+  }
+}
diff --git a/vp10/common/vp10_fwd_txfm2d_cfg.h b/vp10/common/vp10_fwd_txfm2d_cfg.h
index e8c82fd..ed976df 100644
--- a/vp10/common/vp10_fwd_txfm2d_cfg.h
+++ b/vp10/common/vp10_fwd_txfm2d_cfg.h
@@ -10,6 +10,7 @@
 
 #ifndef VP10_FWD_TXFM2D_CFG_H_
 #define VP10_FWD_TXFM2D_CFG_H_
+#include "vp10/common/enums.h"
 #include "vp10/common/vp10_fwd_txfm1d.h"
 //  ---------------- config fwd_dct_dct_4 ----------------
 static const int8_t fwd_shift_dct_dct_4[3] = {2, 0, 0};
@@ -399,4 +400,10 @@
     TXFM_TYPE_ADST32,                 // .txfm_type_col
     TXFM_TYPE_DCT32};                 // .txfm_type_row
 
+const TXFM_2D_CFG* vp10_get_txfm_4x4_cfg(int tx_type);
+const TXFM_2D_CFG* vp10_get_txfm_8x8_cfg(int tx_type);
+const TXFM_2D_CFG* vp10_get_txfm_16x16_cfg(int tx_type);
+const TXFM_2D_CFG* vp10_get_txfm_32x32_cfg(int tx_type);
+const TXFM_2D_CFG* vp10_get_txfm_64x64_cfg(int tx_type);
+
 #endif  // VP10_FWD_TXFM2D_CFG_H_
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 1d227dd..ae0d2cb 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -614,15 +614,15 @@
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #fwd txfm
-  add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, const int stride, int tx_type, const int bd";
   specialize qw/vp10_fwd_txfm2d_4x4 sse4_1/;
-  add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, const int stride, int tx_type, const int bd";
   specialize qw/vp10_fwd_txfm2d_8x8 sse4_1/;
-  add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, const int stride, int tx_type, const int bd";
   specialize qw/vp10_fwd_txfm2d_16x16 sse4_1/;
-  add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, const int stride, int tx_type, const int bd";
   specialize qw/vp10_fwd_txfm2d_32x32 sse4_1/;
-  add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, const int stride, int tx_type, const int bd";
   specialize qw/vp10_fwd_txfm2d_64x64 sse4_1/;
 
   #inv txfm
diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
index 6664bd5..d884571 100644
--- a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
+++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
 #include "vp10/common/x86/vp10_txfm1d_sse4.h"
 
 static inline void int16_array_with_stride_to_int32_array_without_stride(
@@ -49,8 +60,8 @@
 }
 
 static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
-                                   const int stride, const TXFM_2D_CFG *cfg,
-                                   int32_t *txfm_buf) {
+                                     const int stride, const TXFM_2D_CFG *cfg,
+                                     int32_t *txfm_buf) {
   const int txfm_size = cfg->txfm_size;
   const int8_t *shift = cfg->shift;
   const int8_t *stage_range_col = cfg->stage_range_col;
@@ -77,41 +88,46 @@
 }
 
 void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output,
-                              const int stride, const TXFM_2D_CFG *cfg,
-                              const int bd) {
+                                const int stride, int tx_type,
+                                const int bd) {
   int32_t txfm_buf[16];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_4x4_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output,
-                              const int stride, const TXFM_2D_CFG *cfg,
-                              const int bd) {
+                                const int stride, int tx_type,
+                                const int bd) {
   int32_t txfm_buf[64];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_8x8_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output,
-                                const int stride, const TXFM_2D_CFG *cfg,
-                                const int bd) {
+                                  const int stride, int tx_type,
+                                  const int bd) {
   int32_t txfm_buf[256];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_16x16_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
-                                const int stride, const TXFM_2D_CFG *cfg,
-                                const int bd) {
+                                  const int stride, int tx_type,
+                                  const int bd) {
   int32_t txfm_buf[1024];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_32x32_cfg(tx_type);  // may be NULL
   (void)bd;
   fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
-                                const int stride, const TXFM_2D_CFG *cfg,
-                                const int bd) {
+                                  const int stride, int tx_type,
+                                  const int bd) {
   int32_t txfm_buf[4096];
+  const TXFM_2D_CFG* cfg = vp10_get_txfm_64x64_cfg(tx_type);  // DCT_DCT only
   (void)bd;
   fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index b192164..491f2ac 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -195,20 +195,10 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_dct_dct_4, bd);
-      break;
     case ADST_DCT:
-      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_adst_dct_4, bd);
-      break;
     case DCT_ADST:
-      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_dct_adst_4, bd);
-      break;
     case ADST_ADST:
-      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_adst_adst_4, bd);
+      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
@@ -242,20 +232,10 @@
   (void)fwd_txfm_opt;
   switch (tx_type) {
     case DCT_DCT:
-      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_dct_dct_8, bd);
-      break;
     case ADST_DCT:
-      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_adst_dct_8, bd);
-      break;
     case DCT_ADST:
-      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_dct_adst_8, bd);
-      break;
     case ADST_ADST:
-      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride,
-                          &fwd_txfm_2d_cfg_adst_adst_8, bd);
+      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
@@ -290,20 +270,10 @@
   (void)fwd_txfm_opt;
   switch (tx_type) {
     case DCT_DCT:
-      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride,
-                            &fwd_txfm_2d_cfg_dct_dct_16, bd);
-      break;
     case ADST_DCT:
-      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride,
-                            &fwd_txfm_2d_cfg_adst_dct_16, bd);
-      break;
     case DCT_ADST:
-      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride,
-                            &fwd_txfm_2d_cfg_dct_adst_16, bd);
-      break;
     case ADST_ADST:
-      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride,
-                            &fwd_txfm_2d_cfg_adst_adst_16, bd);
+      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
@@ -340,8 +310,7 @@
   (void)fwd_txfm_opt;
   switch (tx_type) {
     case DCT_DCT:
-      vp10_fwd_txfm2d_32x32(src_diff, coeff, diff_stride,
-                            &fwd_txfm_2d_cfg_dct_dct_32, bd);
+      vp10_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
       break;
 #if CONFIG_EXT_TX
     case ADST_DCT:
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 9423ed2..823095e 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -99,8 +99,8 @@
 static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
                           int sad_per_bit) {
 #if CONFIG_REF_MV
-  const MV diff = { (mv->row - ref->row) << 3,
-                    (mv->col - ref->col) << 3 };
+  const MV diff = { (mv->row - ref->row) * 8,
+                    (mv->col - ref->col) * 8 };
   return ROUND_POWER_OF_TWO(
       (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->mvsadcost) *
           sad_per_bit,
@@ -171,15 +171,6 @@
  * could reduce the area.
  */
 
-/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
- * from the same math as in mv_err_cost(). */
-#define MVC(r, c)                                              \
-    (mvcost ?                                                  \
-     ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] +      \
-       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) *        \
-      error_per_bit + 8192) >> 14 : 0)
-
-
 // convert motion vector component to offset for sv[a]f calc
 static INLINE int sp(int x) {
   return x & 7;
@@ -192,13 +183,16 @@
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    MV this_mv = {r, c};                                               \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
     if (second_pred == NULL)                                           \
       thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
                              src_stride, &sse);                        \
     else                                                               \
       thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
                               z, src_stride, &sse, second_pred);       \
-    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+    v += thismse;                                                      \
+    if (v < besterr) {                                                 \
       besterr = v;                                                     \
       br = r;                                                          \
       bc = c;                                                          \
@@ -219,10 +213,13 @@
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER1(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    MV this_mv = {r, c};                                               \
     thismse = upsampled_pref_error(xd, vfp, z, src_stride,             \
                                    upre(y, y_stride, r, c), y_stride,  \
                                    second_pred, w, h, &sse);           \
-    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+    v += thismse;                                                      \
+    if (v < besterr) {                                                 \
       besterr = v;                                                     \
       br = r;                                                          \
       bc = c;                                                          \
@@ -334,8 +331,6 @@
   const int offset = bestmv->row * y_stride + bestmv->col;                 \
   const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
                                                                            \
-  int rr = ref_mv->row;                                                    \
-  int rc = ref_mv->col;                                                    \
   int br = bestmv->row * 8;                                                \
   int bc = bestmv->col * 8;                                                \
   int hstep = 4;                                                           \
@@ -762,8 +757,6 @@
   const int offset = bestmv->row * y_stride + bestmv->col;
   const uint8_t *const y = xd->plane[0].pre[0].buf;
 
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
   int br = bestmv->row * 8;
   int bc = bestmv->col * 8;
   int hstep = 4;
@@ -790,7 +783,7 @@
   if (use_upsampled_ref)
     besterr = upsampled_setup_center_error(xd, bestmv, ref_mv, error_per_bit,
                                            vfp, z, src_stride, y, y_stride,
-                                           second_pred, w, h, (offset << 3),
+                                           second_pred, w, h, (offset * 8),
                                            mvjcost, mvcost, sse1, distortion);
   else
     besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
@@ -920,7 +913,6 @@
   return besterr;
 }
 
-#undef MVC
 #undef PRE
 #undef CHECK_BETTER
 
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index c8f5715..6ffa790 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1829,6 +1829,8 @@
         if (centroids[i] == centroids[i - 1]) {
           j = i;
           while (j < k - 1) {
+            assert((j + 1) < PALETTE_MAX_SIZE);
+            assert(j > 0);
             centroids[j] = centroids[j + 1];
             ++j;
           }
@@ -5046,7 +5048,8 @@
         if (!has_second_rf &&
 #if CONFIG_EXT_INTER
             have_newmv_in_inter_mode(this_mode) &&
-            seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV
+            (seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV ||
+             vp10_use_mv_hp(&bsi->ref_mv[0]->as_mv) == 0)
 #else
             this_mode == NEWMV &&
             (seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV ||
@@ -8873,7 +8876,6 @@
       else if (best_mbmode.mv[0].as_int == 0)
         best_mbmode.mode = ZEROMV;
     } else {
-      int i;
       int_mv nearestmv[2] = { frame_mv[NEARESTMV][refs[0]],
                               frame_mv[NEARESTMV][refs[1]] };
       int_mv nearmv[2] = { frame_mv[NEARMV][refs[0]],
@@ -8885,6 +8887,7 @@
          nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
        }
 #else
+      int i;
       int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) ?
           VPXMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) : INT_MAX;