Merge "Replacing (raster_block >> tx_size) with (block >> (tx_size << 1))."
diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc
index 441cc44..30a3118 100644
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -35,7 +35,7 @@
   }
 
   virtual void BeginPassHook(unsigned int /*pass*/) {
-    psnr_ = 0.0;
+    psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
@@ -65,9 +65,9 @@
   init_flags_ = VPX_CODEC_USE_PSNR;
 
-  // intentionally changed the dimension for better testing coverage
+  // encode at the source's true dimensions
-  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                      timebase.den, timebase.num, 0, 30);
-
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double psnr_lossless = GetMinPsnr();
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
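
Note: the BeginPassHook() change above works because psnr_ tracks the running
minimum that GetMinPsnr() reports; the harness caps lossless PSNR at kMaxPsnr
(100 dB in the test utilities), so seeding the minimum with 0.0 could never
satisfy the EXPECT_GE check. A minimal C sketch of that invariant, with the
sentinel value assumed:

    #include <stdio.h>

    #define kMaxPsnr 100.0  /* assumed cap for lossless frames */

    static double min_psnr;

    static void begin_pass(void)     { min_psnr = kMaxPsnr; }
    static void frame_psnr(double p) { if (p < min_psnr) min_psnr = p; }

    int main(void) {
      begin_pass();
      frame_psnr(kMaxPsnr);  /* every lossless frame reports the cap */
      frame_psnr(kMaxPsnr);
      printf("min psnr = %.1f\n", min_psnr);  /* 100.0, so the GE check holds */
      return 0;
    }
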
diff --git a/tools_common.h b/tools_common.h
index 7dfd5ad..068e7b5 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -12,6 +12,46 @@
 
 #include <stdio.h>
 
+#include "./vpx_config.h"
+
+#if defined(_MSC_VER)
+/* MSVS doesn't define off_t, and uses _f{seek,tell}i64. */
+typedef __int64 off_t;
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#elif defined(_WIN32)
+/* MinGW defines off_t as long and uses f{seek,tell}o64/off64_t for large
+ * files. */
+#define fseeko fseeko64
+#define ftello ftello64
+#define off_t off64_t
+#endif  /* _WIN32 */
+
+#if CONFIG_OS_SUPPORT
+#if defined(_MSC_VER)
+#include <io.h>  /* NOLINT */
+#define snprintf _snprintf
+#define isatty   _isatty
+#define fileno   _fileno
+#else
+#include <unistd.h>  /* NOLINT */
+#endif  /* _MSC_VER */
+#endif  /* CONFIG_OS_SUPPORT */
+
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#if !CONFIG_OS_SUPPORT
+typedef long off_t;  /* NOLINT */
+#define fseeko fseek
+#define ftello ftell
+#endif  /* !CONFIG_OS_SUPPORT */
+
+#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
+
+#ifndef PATH_MAX
+#define PATH_MAX 512
+#endif
+
 #define VP8_FOURCC (0x30385056)
 #define VP9_FOURCC (0x30395056)
 #define VP8_FOURCC_MASK (0x00385056)
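
Note: the block added above consolidates the Windows/RVCT large-file shims
that later hunks delete from vpxenc.c and webmenc.c (vpxdec.c loses its
private PATH_MAX too). A sketch of how a tool inside the libvpx tree would
use it; the input file name is hypothetical:

    #include <stdio.h>
    #include <stdint.h>
    #include "tools_common.h"  /* maps fseeko/ftello/off_t per platform */

    /* 4 GiB, built from two 32-bit halves. */
    static const uint64_t k4GiB = LITERALU64(0x1, 0x0);

    int main(void) {
      FILE *f = fopen("input.webm", "rb");  /* hypothetical input */
      off_t size;
      if (!f) return 1;
      fseeko(f, 0, SEEK_END);  /* 64-bit safe on MSVC/MinGW via the macros */
      size = ftello(f);
      fclose(f);
      return size > (off_t)k4GiB;  /* nonzero for files past 4 GiB */
    }
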
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index d0d4852..f52adfc 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -82,9 +82,8 @@
 
 #define INTER_MODES (1 + NEWMV - NEARESTMV)
 
-static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
-  return (mode - NEARESTMV);
-}
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
 
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
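
Note: INTER_OFFSET() maps the inter modes onto 0..INTER_MODES-1 so they can
index probability and count tables directly. A standalone sketch of the
mapping; the enum values are illustrative stand-ins (in vp9_blockd.h the
inter modes follow the ten intra modes):

    #include <assert.h>

    enum { NEARESTMV = 10, NEARMV, ZEROMV, NEWMV, MB_MODE_COUNT };

    #define INTER_OFFSET(mode) ((mode) - NEARESTMV)
    #define INTER_MODES (1 + NEWMV - NEARESTMV)

    int main(void) {
      unsigned counts[INTER_MODES] = { 0 };  /* indexed by offset, not mode */
      int m;
      for (m = NEARESTMV; m < MB_MODE_COUNT; ++m)
        ++counts[INTER_OFFSET(m)];           /* 0..3 */
      assert(INTER_OFFSET(NEWMV) == INTER_MODES - 1);
      return 0;
    }
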
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index d3a867c..2676762 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -332,7 +332,7 @@
           if (l >= 3 && k == 0)
             continue;
           vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct,
-                                           coef_counts[i][j][k][l], 0);
+                                           coef_counts[i][j][k][l]);
           branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
           for (m = 0; m < UNCONSTRAINED_NODES; ++m)
             dst_coef_probs[i][j][k][l][m] = merge_probs(
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index a963d55..3b2510d 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -235,9 +235,9 @@
 struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
 
 const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, -NEWMV
+  -INTER_OFFSET(ZEROMV), 2,
+  -INTER_OFFSET(NEARESTMV), 4,
+  -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
 struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
@@ -343,8 +343,7 @@
   vp9_tokens_from_tree(vp9_switchable_interp_encodings,
                        vp9_switchable_interp_tree);
   vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
-  vp9_tokens_from_tree_offset(vp9_inter_mode_encodings,
-                              vp9_inter_mode_tree, NEARESTMV);
+  vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree);
 }
 
 #define COUNT_SAT 20
@@ -356,9 +355,9 @@
 
 static void adapt_probs(const vp9_tree_index *tree,
                         const vp9_prob *pre_probs, const unsigned int *counts,
-                        unsigned int offset, vp9_prob *probs) {
-  tree_merge_probs(tree, pre_probs, counts, offset,
-                   COUNT_SAT, MAX_UPDATE_FACTOR, probs);
+                        vp9_prob *probs) {
+  tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
+                   probs);
 }
 
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
@@ -383,25 +382,24 @@
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
     adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
-                counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]);
+                counts->inter_mode[i], fc->inter_mode_probs[i]);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
     adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
-                counts->y_mode[i], 0, fc->y_mode_prob[i]);
+                counts->y_mode[i], fc->y_mode_prob[i]);
 
   for (i = 0; i < INTRA_MODES; ++i)
     adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
-                counts->uv_mode[i], 0, fc->uv_mode_prob[i]);
+                counts->uv_mode[i], fc->uv_mode_prob[i]);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
     adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
-                counts->partition[i], 0, fc->partition_prob[i]);
+                counts->partition[i], fc->partition_prob[i]);
 
   if (cm->mcomp_filter_type == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
-                  counts->switchable_interp[i], 0,
-                  fc->switchable_interp_prob[i]);
+                  counts->switchable_interp[i], fc->switchable_interp_prob[i]);
   }
 
   if (cm->tx_mode == TX_MODE_SELECT) {
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index b061cdb..290dcdd 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -196,8 +196,8 @@
 
 static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
                         const unsigned int *counts, vp9_prob *probs) {
-  tree_merge_probs(tree, pre_probs, counts, 0,
-                   MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs);
+  tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR,
+                   probs);
 }
 
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
@@ -207,8 +207,7 @@
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
 
-  adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
-                     fc->joints);
+  adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
 
   for (i = 0; i < 2; ++i) {
     nmv_component *comp = &fc->comps[i];
diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c
index 1805fb4..e2a5b9f 100644
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -35,28 +35,20 @@
   tree2tok(p, t, 0, 0, 0);
 }
 
-void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
-                                 int offset) {
-  tree2tok(p - offset, t, 0, 0, 0);
-}
-
 static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
                                          unsigned int branch_ct[][2],
-                                         const unsigned int num_events[],
-                                         unsigned int tok0_offset) {
+                                         const unsigned int num_events[]) {
   unsigned int left, right;
 
-  if (tree[i] <= 0) {
-    left = num_events[-tree[i] - tok0_offset];
-  } else {
-    left = convert_distribution(tree[i], tree, branch_ct, num_events,
-                                tok0_offset);
-  }
-  if (tree[i + 1] <= 0)
-    right = num_events[-tree[i + 1] - tok0_offset];
+  if (tree[i] <= 0)
+    left = num_events[-tree[i]];
   else
-    right = convert_distribution(tree[i + 1], tree, branch_ct, num_events,
-                                 tok0_offset);
+    left = convert_distribution(tree[i], tree, branch_ct, num_events);
+
+  if (tree[i + 1] <= 0)
+    right = num_events[-tree[i + 1]];
+  else
+    right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
 
   branch_ct[i >> 1][0] = left;
   branch_ct[i >> 1][1] = right;
@@ -65,9 +57,8 @@
 
 void vp9_tree_probs_from_distribution(vp9_tree tree,
                                       unsigned int branch_ct[/* n-1 */][2],
-                                      const unsigned int num_events[/* n */],
-                                      unsigned int tok0_offset) {
-  convert_distribution(0, tree, branch_ct, num_events, tok0_offset);
+                                      const unsigned int num_events[/* n */]) {
+  convert_distribution(0, tree, branch_ct, num_events);
 }
 
 
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 9c776d6..a79b156 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -42,7 +42,6 @@
 /* Construct encoding array from tree. */
 
 void vp9_tokens_from_tree(struct vp9_token*, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
 
 /* Convert array of token occurrence counts into a table of probabilities
    for the associated binary encoding tree.  Also writes count of branches
@@ -51,8 +50,7 @@
 
 void vp9_tree_probs_from_distribution(vp9_tree tree,
                                       unsigned int branch_ct[ /* n - 1 */ ][2],
-                                      const unsigned int num_events[ /* n */ ],
-                                      unsigned int tok0_offset);
+                                      const unsigned int num_events[ /* n */ ]);
 
 
 static INLINE vp9_prob clip_prob(int p) {
@@ -116,10 +114,10 @@
 
 static void tree_merge_probs(const vp9_tree_index *tree,
                              const vp9_prob *pre_probs,
-                             const unsigned int *counts, int offset,
+                             const unsigned int *counts,
                              unsigned int count_sat,
                              unsigned int max_update_factor, vp9_prob *probs) {
-  tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset],
+  tree_merge_probs_impl(0, tree, pre_probs, counts,
                         count_sat, max_update_factor, probs);
 }
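
Note: tok0_offset can be dropped because vp9_inter_mode_tree (rewritten in
vp9_entropymode.c above) now stores -INTER_OFFSET(mode) at its leaves, so
every tree's leaf value is already a 0-based index into num_events/counts.
A minimal sketch of convert_distribution's indexing under that convention,
using a made-up 4-leaf tree:

    #include <stdio.h>

    typedef signed char vp9_tree_index;

    /* 0-based leaves (-0..-3); internal nodes sit at even indices. */
    static const vp9_tree_index tree[6] = { -0, 2, -1, 4, -2, -3 };

    static unsigned convert(unsigned i, unsigned branch_ct[][2],
                            const unsigned num_events[]) {
      const unsigned left =
          tree[i] <= 0 ? num_events[-tree[i]]
                       : convert(tree[i], branch_ct, num_events);
      const unsigned right =
          tree[i + 1] <= 0 ? num_events[-tree[i + 1]]
                           : convert(tree[i + 1], branch_ct, num_events);
      branch_ct[i >> 1][0] = left;
      branch_ct[i >> 1][1] = right;
      return left + right;
    }

    int main(void) {
      const unsigned counts[4] = { 10, 20, 30, 40 };  /* per-symbol counts */
      unsigned branch_ct[3][2];
      printf("total = %u\n", convert(0, branch_ct, counts));  /* 100 */
      return 0;
    }
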
 
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ccf5aac..2a33844 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,6 +15,16 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
+#define RECON_AND_STORE4X4(dest, in_x) \
+{                                                     \
+  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+  d0 = _mm_unpacklo_epi8(d0, zero); \
+  d0 = _mm_add_epi16(in_x, d0); \
+  d0 = _mm_packus_epi16(d0, d0); \
+  *(int *)dest = _mm_cvtsi128_si32(d0); \
+  dest += stride; \
+}
+
 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
@@ -26,21 +36,19 @@
   __m128i input0, input1, input2, input3;
 
   // Rows
-  input0 = _mm_loadl_epi64((const __m128i *)input);
-  input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
-  input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
-  input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
+  input0 = _mm_load_si128((const __m128i *)input);
+  input2 = _mm_load_si128((const __m128i *)(input + 8));
 
   // Construct i3, i1, i3, i1, i2, i0, i2, i0
   input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input1 = _mm_shufflelo_epi16(input1, 0xd8);
+  input0 = _mm_shufflehi_epi16(input0, 0xd8);
   input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+  input2 = _mm_shufflehi_epi16(input2, 0xd8);
 
+  input1 = _mm_unpackhi_epi32(input0, input0);
   input0 = _mm_unpacklo_epi32(input0, input0);
-  input1 = _mm_unpacklo_epi32(input1, input1);
+  input3 = _mm_unpackhi_epi32(input2, input2);
   input2 = _mm_unpacklo_epi32(input2, input2);
-  input3 = _mm_unpacklo_epi32(input3, input3);
 
   // Stage 1
   input0 = _mm_madd_epi16(input0, cst);
@@ -59,16 +67,14 @@
   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
 
   // Stage 2
-  input0 = _mm_packs_epi32(input0, zero);
-  input1 = _mm_packs_epi32(input1, zero);
-  input2 = _mm_packs_epi32(input2, zero);
-  input3 = _mm_packs_epi32(input3, zero);
+  input0 = _mm_packs_epi32(input0, input1);
+  input1 = _mm_packs_epi32(input2, input3);
 
   // Transpose
-  input1 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpacklo_epi16(input2, input3);
-  input0 = _mm_unpacklo_epi32(input1, input3);
-  input1 = _mm_unpackhi_epi32(input1, input3);
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
 
   // Switch column2, column 3, and then, we got:
   // input2: column1, column 0;  input3: column2, column 3.
@@ -78,14 +84,9 @@
 
   // Columns
   // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input2, 0xd8);
-  input1 = _mm_shufflehi_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input3, 0xd8);
-  input3 = _mm_shufflelo_epi16(input3, 0xd8);
-
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input1 = _mm_unpackhi_epi32(input1, input1);
-  input2 = _mm_unpackhi_epi32(input2, input2);
+  input0 = _mm_unpacklo_epi32(input2, input2);
+  input1 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpackhi_epi32(input3, input3);
   input3 = _mm_unpacklo_epi32(input3, input3);
 
   // Stage 1
@@ -105,16 +106,14 @@
   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
 
   // Stage 2
-  input0 = _mm_packs_epi32(input0, zero);
-  input1 = _mm_packs_epi32(input1, zero);
-  input2 = _mm_packs_epi32(input2, zero);
-  input3 = _mm_packs_epi32(input3, zero);
+  input0 = _mm_packs_epi32(input0, input2);
+  input1 = _mm_packs_epi32(input1, input3);
 
   // Transpose
-  input1 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpacklo_epi16(input2, input3);
-  input0 = _mm_unpacklo_epi32(input1, input3);
-  input1 = _mm_unpackhi_epi32(input1, input3);
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
 
   // Switch column2, column 3, and then, we got:
   // input2: column1, column 0;  input3: column2, column 3.
@@ -129,23 +128,31 @@
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-#define RECON_AND_STORE4X4(dest, in_x) \
-  {                                                     \
-      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-      d0 = _mm_unpacklo_epi8(d0, zero); \
-      d0 = _mm_add_epi16(in_x, d0); \
-      d0 = _mm_packus_epi16(d0, d0); \
-      *(int *)dest = _mm_cvtsi128_si32(d0); \
-      dest += stride; \
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(
+        d0, _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, input2);
+    d2 = _mm_add_epi16(d2, input3);
+    d0 = _mm_packus_epi16(d0, d2);
+    // store input0
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store input1
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store input2
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    // store input3
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
   }
-
-  input0 = _mm_srli_si128(input2, 8);
-  input1 = _mm_srli_si128(input3, 8);
-
-  RECON_AND_STORE4X4(dest, input2);
-  RECON_AND_STORE4X4(dest, input0);
-  RECON_AND_STORE4X4(dest, input1);
-  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
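
Note: the rewritten tail above batches what four RECON_AND_STORE4X4 calls did
one row at a time: a single _mm_packus_epi16 performs the 0..255 clamp for
the whole 4x4 block, and the stride*3/stride*2 store order mirrors how d2
was assembled. The scalar operation being vectorized is roughly this sketch
(sample values are made up):

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t clip8(int v) {  /* what _mm_packus_epi16 does per lane */
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    int main(void) {
      uint8_t dest[4][4] = { { 100 } };                /* prediction */
      int16_t residual[4][4] = { { -120, 30, 0, 0 } }; /* post-idct */
      int r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          dest[r][c] = clip8(dest[r][c] + residual[r][c]);
      printf("%u %u\n", dest[0][0], dest[0][1]);  /* 0 (clamped) and 30 */
      return 0;
    }
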
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 9792d2c..14f2ce5 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -48,12 +48,13 @@
 }
 
 static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
-                                          uint8_t context) {
-  const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
-                                             cm->fc.inter_mode_probs[context]);
+                                          int ctx) {
+  const int mode = treed_read(r, vp9_inter_mode_tree,
+                              cm->fc.inter_mode_probs[ctx]);
   if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
-  return mode;
+    ++cm->counts.inter_mode[ctx][mode];
+
+  return NEARESTMV + mode;
 }
 
 static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
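
Note: read_inter_mode() now keeps the 0-based tree symbol for the counts
update and converts back to the enum only on return; write_sb_mv_ref() in
vp9_bitstream.c applies INTER_OFFSET in the opposite direction. A round-trip
sketch with illustrative enum values:

    #include <assert.h>

    enum { NEARESTMV = 10, NEARMV, ZEROMV, NEWMV };

    #define INTER_OFFSET(mode) ((mode) - NEARESTMV)

    int main(void) {
      unsigned counts[4] = { 0 };
      const int mode = ZEROMV;             /* what the encoder coded */
      const int sym = INTER_OFFSET(mode);  /* tree symbol, 0..3 */
      ++counts[sym];                       /* counts indexed by symbol */
      assert(NEARESTMV + sym == mode);     /* decoder's return value */
      return 0;
    }
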
@@ -260,6 +261,16 @@
   mv->col = ref->col + diff.col;
 }
 
+static COMPPREDMODE_TYPE read_reference_mode(VP9_COMMON *cm,
+                                             const MACROBLOCKD *xd,
+                                             vp9_reader *r) {
+  const int ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
+  const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+  if (!cm->frame_parallel_decoding_mode)
+    ++cm->counts.comp_inter[ctx][mode];
+  return mode;  // SINGLE_PREDICTION_ONLY or COMP_PREDICTION_ONLY
+}
+
 // Read the reference frame
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             vp9_reader *r,
@@ -271,27 +282,20 @@
     ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
-    const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
-    int is_comp;
-
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
-      if (!cm->frame_parallel_decoding_mode)
-        ++counts->comp_inter[comp_ctx][is_comp];
-    } else {
-      is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-    }
+    const COMPPREDMODE_TYPE mode = (cm->comp_pred_mode == HYBRID_PREDICTION)
+                                      ? read_reference_mode(cm, xd, r)
+                                      : cm->comp_pred_mode;
 
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
-    if (is_comp) {
-      const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
-      const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
-      const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
+    if (mode == COMP_PREDICTION_ONLY) {
+      const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+      const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
       if (!cm->frame_parallel_decoding_mode)
-        ++counts->comp_ref[ref_ctx][b];
-      ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
-      ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
-    } else {
+        ++counts->comp_ref[ctx][bit];
+      ref_frame[idx] = cm->comp_fixed_ref;
+      ref_frame[!idx] = cm->comp_var_ref[bit];
+    } else if (mode == SINGLE_PREDICTION_ONLY) {
       const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
       const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
       if (!cm->frame_parallel_decoding_mode)
@@ -299,14 +303,16 @@
       if (bit0) {
         const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
         const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
-        ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
         if (!cm->frame_parallel_decoding_mode)
           ++counts->single_ref[ctx1][1][bit1];
+        ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
       } else {
         ref_frame[0] = LAST_FRAME;
       }
 
       ref_frame[1] = NONE;
+    } else {
+      assert(!"Invalid prediction mode.");
     }
   }
 }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 86fba05..aad400a 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -153,47 +153,38 @@
       vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
 }
 
-static void update_mv(vp9_reader *r, vp9_prob *p) {
-  if (vp9_read(r, NMV_UPDATE_PROB))
-    *p = (vp9_read_literal(r, 7) << 1) | 1;
+static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
+  int i;
+  for (i = 0; i < n; ++i)
+    if (vp9_read(r, NMV_UPDATE_PROB))
+      p[i] = (vp9_read_literal(r, 7) << 1) | 1;
 }
 
-static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
-  int i, j, k;
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
+  int i, j;
 
-  for (j = 0; j < MV_JOINTS - 1; ++j)
-    update_mv(r, &mvc->joints[j]);
+  update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
 
   for (i = 0; i < 2; ++i) {
-    nmv_component *const comp = &mvc->comps[i];
-
-    update_mv(r, &comp->sign);
-
-    for (j = 0; j < MV_CLASSES - 1; ++j)
-      update_mv(r, &comp->classes[j]);
-
-    for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      update_mv(r, &comp->class0[j]);
-
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      update_mv(r, &comp->bits[j]);
+    nmv_component *const comp_ctx = &ctx->comps[i];
+    update_mv_probs(&comp_ctx->sign, 1, r);
+    update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
+    update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
+    update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
   }
 
   for (i = 0; i < 2; ++i) {
-    nmv_component *const comp = &mvc->comps[i];
-
+    nmv_component *const comp_ctx = &ctx->comps[i];
     for (j = 0; j < CLASS0_SIZE; ++j)
-      for (k = 0; k < 3; ++k)
-        update_mv(r, &comp->class0_fp[j][k]);
-
-    for (j = 0; j < 3; ++j)
-      update_mv(r, &comp->fp[j]);
+      update_mv_probs(comp_ctx->class0_fp[j], 3, r);
+    update_mv_probs(comp_ctx->fp, 3, r);
   }
 
   if (allow_hp) {
     for (i = 0; i < 2; ++i) {
-      update_mv(r, &mvc->comps[i].class0_hp);
-      update_mv(r, &mvc->comps[i].hp);
+      nmv_component *const comp_ctx = &ctx->comps[i];
+      update_mv_probs(&comp_ctx->class0_hp, 1, r);
+      update_mv_probs(&comp_ctx->hp, 1, r);
     }
   }
 }
@@ -209,20 +200,22 @@
 // Allocate storage for each tile column.
 // TODO(jzern): when max_threads <= 1 the same storage could be used for each
 // tile.
-static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
+static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) {
   VP9_COMMON *const cm = &pbi->common;
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-  int i, tile_col;
+  int i, tile_row, tile_col;
 
   CHECK_MEM_ERROR(cm, pbi->mi_streams,
-                  vpx_realloc(pbi->mi_streams, tile_cols *
+                  vpx_realloc(pbi->mi_streams, tile_rows * tile_cols *
                               sizeof(*pbi->mi_streams)));
-  for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-    TileInfo tile;
-
-    vp9_tile_init(&tile, cm, 0, tile_col);
-    pbi->mi_streams[tile_col] =
-        &cm->mi[cm->mi_rows * tile.mi_col_start];
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileInfo tile;
+      vp9_tile_init(&tile, cm, tile_row, tile_col);
+      pbi->mi_streams[tile_row * tile_cols + tile_col] =
+          &cm->mi[tile.mi_row_start * cm->mode_info_stride
+                  + tile.mi_col_start];
+    }
   }
 
   // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
@@ -360,16 +353,15 @@
   const int bh = num_8x8_blocks_high_lookup[bsize];
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int offset = mi_row * cm->mode_info_stride + mi_col;
-
-  xd->mode_info_stride = cm->mode_info_stride;
+  const int tile_offset = tile->mi_row_start * cm->mode_info_stride +
+                          tile->mi_col_start;
 
   xd->mi_8x8 = cm->mi_grid_visible + offset;
   xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
 
   // we are using the mode info context stream here
-  xd->mi_8x8[0] = xd->mi_stream;
+  xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset;
   xd->mi_8x8[0]->mbmi.sb_type = bsize;
-  ++xd->mi_stream;
 
   // Special case: if prev_mi is NULL, the previous mode info context
   // cannot be used.
@@ -768,9 +760,10 @@
 }
 
 static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
-                               int tile_col) {
+                               int tile_row, int tile_col) {
   int i;
-  xd->mi_stream = pbi->mi_streams[tile_col];
+  const int tile_cols = 1 << pbi->common.log2_tile_cols;
+  xd->mi_stream = pbi->mi_streams[tile_row * tile_cols + tile_col];
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     xd->above_context[i] = pbi->above_context[i];
@@ -874,77 +867,68 @@
   return size;
 }
 
-static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
-  vp9_reader residual_bc;
+typedef struct TileBuffer {
+  const uint8_t *data;
+  size_t size;
+} TileBuffer;
 
+static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-
-  const uint8_t *const data_end = pbi->source + pbi->source_sz;
-  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
+  TileBuffer tile_buffers[4][1 << 6];
   int tile_row, tile_col;
+  const uint8_t *const data_end = pbi->source + pbi->source_sz;
+  const uint8_t *end = NULL;
+  vp9_reader r;
+
+  assert(tile_rows <= 4);
+  assert(tile_cols <= (1 << 6));
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
   vpx_memset(pbi->above_context[0], 0,
-             sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
-             2 * aligned_mi_cols);
+             sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * 2 * aligned_cols);
 
   vpx_memset(pbi->above_seg_context, 0,
-             sizeof(*pbi->above_seg_context) * aligned_mi_cols);
+             sizeof(*pbi->above_seg_context) * aligned_cols);
 
-  if (pbi->oxcf.inv_tile_order) {
-    const uint8_t *data_ptr2[4][1 << 6];
-    vp9_reader bc_bak = {0};
-
-    // pre-initialize the offsets, we're going to decode in inverse order
-    data_ptr2[0][0] = data;
-    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-        const int last_tile =
-            tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
-        const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
-        data_ptr2[tile_row][tile_col] = data;
-        data += size;
-      }
-    }
-
-    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
-        TileInfo tile;
-
-        vp9_tile_init(&tile, cm, tile_row, tile_col);
-        setup_token_decoder(data_ptr2[tile_row][tile_col], data_end,
-                            data_end - data_ptr2[tile_row][tile_col],
-                            &cm->error, &residual_bc);
-        setup_tile_context(pbi, xd, tile_col);
-        decode_tile(pbi, &tile, &residual_bc);
-        if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
-          bc_bak = residual_bc;
-      }
-    }
-    residual_bc = bc_bak;
-  } else {
-    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-        const int last_tile =
-            tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
-        const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
-        TileInfo tile;
-
-        vp9_tile_init(&tile, cm, tile_row, tile_col);
-
-        setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
-        setup_tile_context(pbi, xd, tile_col);
-        decode_tile(pbi, &tile, &residual_bc);
-        data += size;
-      }
+  // Load tile data into tile_buffers
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const int last_tile = tile_row == tile_rows - 1 &&
+                            tile_col == tile_cols - 1;
+      const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+      TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
+      buf->data = data;
+      buf->size = size;
+      data += size;
     }
   }
 
-  return vp9_reader_find_end(&residual_bc);
+  // Decode tiles using data from tile_buffers
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const int col = pbi->oxcf.inv_tile_order ? tile_cols - tile_col - 1
+                                               : tile_col;
+      const int last_tile = tile_row == tile_rows - 1 &&
+                            col == tile_cols - 1;
+      const TileBuffer *const buf = &tile_buffers[tile_row][col];
+      TileInfo tile;
+
+      vp9_tile_init(&tile, cm, tile_row, col);
+      setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r);
+      setup_tile_context(pbi, xd, tile_row, col);
+      decode_tile(pbi, &tile, &r);
+
+      if (last_tile)
+        end = vp9_reader_find_end(&r);
+    }
+  }
+
+  return end;
 }
 
 static int tile_worker_hook(void *arg1, void *arg2) {
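
Note: with one mi stream per tile rather than per column, setup_tile_context()
selects stream tile_row * tile_cols + tile_col, and inv_tile_order reduces to
remapping the column index after every tile buffer has been loaded in
bitstream order. A small sketch of the index math for a hypothetical 2x4
tile grid:

    #include <stdio.h>

    int main(void) {
      const int tile_rows = 2, tile_cols = 4, inv_tile_order = 1;
      int tile_row, tile_col;
      for (tile_row = 0; tile_row < tile_rows; ++tile_row)
        for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
          const int col = inv_tile_order ? tile_cols - tile_col - 1 : tile_col;
          /* row-major stream index, as used by setup_tile_context() */
          printf("decode tile (%d,%d) -> stream %d\n",
                 tile_row, col, tile_row * tile_cols + col);
        }
      return 0;
    }
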
@@ -1023,7 +1007,7 @@
 
       setup_token_decoder(data, data_end, size, &cm->error,
                           &tile_data->bit_reader);
-      setup_tile_context(pbi, &tile_data->xd, tile_col);
+      setup_tile_context(pbi, &tile_data->xd, 0, tile_col);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || tile_col == tile_cols - 1) {
@@ -1227,7 +1211,7 @@
       for (i = 0; i < PARTITION_TYPES - 1; ++i)
         vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
 
-    read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
+    read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
   }
 
   return vp9_reader_has_error(&r);
@@ -1323,7 +1307,7 @@
     }
   }
 
-  alloc_tile_storage(pbi, tile_cols);
+  alloc_tile_storage(pbi, tile_rows, tile_cols);
 
   xd->mi_8x8 = cm->mi_grid_visible;
   xd->mode_info_stride = cm->mode_info_stride;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 87bd36c..e1978d4 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -169,10 +169,8 @@
                         const unsigned int num_events[/* n */]) {
   int i = 0;
 
-  vp9_tree_probs_from_distribution(tree, bct, num_events, 0);
-  n--;
-
-  for (i = 0; i < n; ++i)
+  vp9_tree_probs_from_distribution(tree, bct, num_events);
+  for (i = 0; i < n - 1; ++i)
     vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
 }
 
@@ -231,7 +229,7 @@
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct,
-                                     cm->counts.switchable_interp[j], 0);
+                                     cm->counts.switchable_interp[j]);
 
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i],
@@ -250,7 +248,7 @@
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
     unsigned int branch_ct[INTER_MODES - 1][2];
     vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct,
-                                     cm->counts.inter_mode[i], NEARESTMV);
+                                     cm->counts.inter_mode[i]);
 
     for (j = 0; j < INTER_MODES - 1; ++j)
       vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j],
@@ -321,7 +319,7 @@
                             const vp9_prob *p) {
   assert(is_inter_mode(mode));
   write_token(w, vp9_inter_mode_tree, p,
-              &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
+              &vp9_inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
 
@@ -448,7 +446,7 @@
       if (bsize >= BLOCK_8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
         ++cm->counts.inter_mode[mi->mode_context[rf]]
-                               [inter_mode_offset(mode)];
+                               [INTER_OFFSET(mode)];
       }
     }
 
@@ -471,7 +469,7 @@
           const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
           write_sb_mv_ref(bc, blockmode, mv_ref_p);
           ++cm->counts.inter_mode[mi->mode_context[rf]]
-                                 [inter_mode_offset(blockmode)];
+                                 [INTER_OFFSET(blockmode)];
 
           if (blockmode == NEWMV) {
 #ifdef ENTROPY_STATS
@@ -703,7 +701,7 @@
             continue;
           vp9_tree_probs_from_distribution(vp9_coef_tree,
                                            coef_branch_ct[i][j][k][l],
-                                           coef_counts[i][j][k][l], 0);
+                                           coef_counts[i][j][k][l]);
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
           for (m = 0; m < UNCONSTRAINED_NODES; ++m)
@@ -1217,7 +1215,7 @@
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       TileInfo tile;
 
-      vp9_tile_init(&tile, cm, 0, tile_col);
+      vp9_tile_init(&tile, cm, tile_row, tile_col);
       tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a45299b..702fc70 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1765,7 +1765,7 @@
 }
 
 static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, TOKENEXTRA **tp, int *totalrate) {
+                          int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON * const cm = &cpi->common;
   int mi_col;
 
@@ -1910,7 +1910,6 @@
   MACROBLOCK * const x = &cpi->mb;
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD * const xd = &x->e_mbd;
-  int totalrate;
 
 //  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
 //           cpi->common.current_video_frame, cpi->common.show_frame,
@@ -1926,8 +1925,6 @@
   }
 #endif
 
-  totalrate = 0;
-
   vp9_zero(cm->counts.switchable_interp);
   vp9_zero(cpi->tx_stepdown_count);
 
@@ -1989,7 +1986,7 @@
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += 8)
-            encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
+            encode_sb_row(cpi, &tile, mi_row, &tp);
 
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2015,10 +2012,6 @@
     cpi->sf.skip_encode_frame = 0;
   }
 
-  // 256 rate units to the bit,
-  // projected_frame_size in units of BYTES
-  cpi->projected_frame_size = totalrate >> 8;
-
 #if 0
   // Keep record of the total distortion this time around for future use
   cpi->last_frame_distortion = cpi->frame_distortion;
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index e2c6c4c..030ca64 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -155,9 +155,8 @@
     unsigned int (*branch_ct_class0_hp)[2],
     unsigned int (*branch_ct_hp)[2]) {
   int i, j, k;
-  vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
-                                   branch_ct_joint,
-                                   nmv_count->joints, 0);
+  vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint,
+                                   nmv_count->joints);
   for (i = 0; i < 2; ++i) {
     const uint32_t s0 = nmv_count->comps[i].sign[0];
     const uint32_t s1 = nmv_count->comps[i].sign[1];
@@ -166,10 +165,10 @@
     branch_ct_sign[i][1] = s1;
     vp9_tree_probs_from_distribution(vp9_mv_class_tree,
                                     branch_ct_classes[i],
-                                    nmv_count->comps[i].classes, 0);
+                                    nmv_count->comps[i].classes);
     vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
                                      branch_ct_class0[i],
-                                     nmv_count->comps[i].class0, 0);
+                                     nmv_count->comps[i].class0);
     for (j = 0; j < MV_OFFSET_BITS; ++j) {
       const uint32_t b0 = nmv_count->comps[i].bits[j][0];
       const uint32_t b1 = nmv_count->comps[i].bits[j][1];
@@ -182,11 +181,11 @@
     for (k = 0; k < CLASS0_SIZE; ++k) {
       vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
                                        branch_ct_class0_fp[i][k],
-                                       nmv_count->comps[i].class0_fp[k], 0);
+                                       nmv_count->comps[i].class0_fp[k]);
     }
     vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
                                      branch_ct_fp[i],
-                                     nmv_count->comps[i].fp, 0);
+                                     nmv_count->comps[i].fp);
   }
   if (usehp) {
     for (i = 0; i < 2; ++i) {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index fca7525..6222394 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -22,7 +22,7 @@
 extern int enc_debug;
 #endif
 
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
@@ -30,58 +30,44 @@
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  int i, rc, eob;
-  int zbins[2], nzbins[2], zbin;
-  int x, y, z, sz;
-  int zero_flag = n_coeffs;
+  int i, non_zero_count = count, eob = -1;
+  const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
+                         zbin_ptr[1] + zbin_oq_value };
+  const int nzbins[2] = { zbins[0] * -1,
+                          zbins[1] * -1 };
 
-  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-
-  eob = -1;
-
-  // Base ZBIN
-  zbins[0] = zbin_ptr[0] + zbin_oq_value;
-  zbins[1] = zbin_ptr[1] + zbin_oq_value;
-  nzbins[0] = zbins[0] * -1;
-  nzbins[1] = zbins[1] * -1;
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
 
   if (!skip_block) {
     // Pre-scan pass
-    for (i = n_coeffs - 1; i >= 0; i--) {
-      rc = scan[i];
-      z = coeff_ptr[rc];
+    for (i = count - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
 
-      if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
-        zero_flag--;
-      } else {
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
         break;
-      }
     }
 
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
+    // Quantization pass: All coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
-    for (i = 0; i < zero_flag; i++) {
-      rc = scan[i];
-      z  = coeff_ptr[rc];
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      zbin = (zbins[rc != 0]);
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
 
-      sz = (z >> 31);                               // sign of z
-      x  = (z ^ sz) - sz;
-
-      if (x >= zbin) {
-        x += (round_ptr[rc != 0]);
-        x  = clamp(x, INT16_MIN, INT16_MAX);
-        y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
-              quant_shift_ptr[rc != 0]) >> 16;      // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0];  // dequantized value
-
-        if (y) {
-          eob = i;                                  // last nonzero coeffs
-        }
+        if (tmp)
+          eob = i;
       }
     }
   }
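
Note: the loop above is the same two-stage Q16 fixed-point quantizer as
before, just with const locals and the sign folded back via XOR/subtract.
A worked scalar sketch; the constants are hypothetical, chosen so the
pipeline reduces to division by 32 (real values come from the per-plane
quant tables):

    #include <stdio.h>
    #include <limits.h>

    enum { QUANT = 0, QUANT_SHIFT = 2048, ROUND = 16, DEQUANT = 32 };

    int main(void) {
      const int coeff = -100;
      const int sign = coeff >> 31;                 /* -1 or 0 */
      const int abs_coeff = (coeff ^ sign) - sign;  /* branch-free abs() */
      int tmp = abs_coeff + ROUND;
      if (tmp > SHRT_MAX) tmp = SHRT_MAX;           /* the clamp() call */
      tmp = ((((tmp * QUANT) >> 16) + tmp) * QUANT_SHIFT) >> 16;
      printf("qcoeff=%d dqcoeff=%d\n",              /* -3 and -96 */
             (tmp ^ sign) - sign, ((tmp ^ sign) - sign) * DEQUANT);
      return 0;
    }
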
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 993919e..6aaa2c7 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -268,10 +268,10 @@
       MB_PREDICTION_MODE m;
 
       for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
-        cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
+        cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] =
             cost_token(vp9_inter_mode_tree,
                        cm->fc.inter_mode_probs[i],
-                       &vp9_inter_mode_encodings[inter_mode_offset(m)]);
+                       &vp9_inter_mode_encodings[INTER_OFFSET(m)]);
     }
   }
 }
@@ -609,7 +609,7 @@
 
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
-  if (plane == 0)
+  if (!xd->lossless && plane == 0)
     x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
 
   args->this_rate += args->rate;
@@ -1416,7 +1416,7 @@
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     assert(is_inter_mode(mode));
-    return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
+    return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
   } else {
     return 0;
   }
@@ -1707,7 +1707,7 @@
         const struct buf_2d orig_src = x->plane[0].src;
         struct buf_2d orig_pre[2];
 
-        mode_idx = inter_mode_offset(this_mode);
+        mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
 
         // if we're near/nearest and mv == 0,0, compare to zeromv
@@ -2002,7 +2002,7 @@
         return;
       }
 
-      mode_idx = inter_mode_offset(mode_selected);
+      mode_idx = INTER_OFFSET(mode_selected);
       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
 
@@ -2078,7 +2078,7 @@
     return INT64_MAX;
   /* set it to the best */
   for (i = 0; i < 4; i++) {
-    mode_idx = inter_mode_offset(bsi->modes[i]);
+    mode_idx = INTER_OFFSET(bsi->modes[i]);
     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
     if (has_second_ref(mbmi))
       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm
index 533456b..1a9e4e8 100644
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -118,6 +118,14 @@
   RET
 %endmacro
 
+%macro INC_SRC_BY_SRC_STRIDE  0
+%if ARCH_X86=1 && CONFIG_PIC=1
+  add                srcq, src_stridemp
+%else
+  add                srcq, src_strideq
+%endif
+%endmacro
+
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +137,85 @@
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                              x_offset, y_offset, \
-                                              dst, dst_stride, \
-                                              sec, sec_stride, height, sse
-%define sec_str sec_strideq
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define h heightd
+  %define bilin_filter sseq
 %else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
-                                          dst, dst_stride, height, sse
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define h dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store the bilin_filter and pw_8 addresses on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                                y_offset, dst, dst_stride, height, sse, \
+                                g_bilin_filter, g_pw_8
+      %define h heightd
+
+      ; Store the bilin_filter and pw_8 addresses on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define h heightd
+      %define sec_str sec_strideq
+      %else
+      %define h dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                              y_offset, dst, dst_stride, height, sse
+      %define h heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
 %endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                                         x_offset, y_offset, \
-                                                         dst, dst_stride, \
-                                                         sec, sec_stride, \
-                                                         height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
-%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
-                                          dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
-%endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -329,11 +381,22 @@
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_zero_y_other_loop:
 %if %1 == 16
   movu                 m0, [srcq]
@@ -615,12 +678,23 @@
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
+%else  ;x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m3, [srcq+1]
@@ -752,12 +826,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_other_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
@@ -873,12 +958,23 @@
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+1]
@@ -1057,6 +1153,21 @@
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so we reuse the src_stride
+; register; src_stride must then be reloaded from the stack when needed.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
   add           y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
   ; x_offset == bilin interpolation && y_offset == bilin interpolation
 %if %1 == 16
   movu                 m0, [srcq]
@@ -1093,7 +1206,9 @@
 %endif
   psraw                m0, 4
   psraw                m2, 4
-  add                srcq, src_strideq
+
+  INC_SRC_BY_SRC_STRIDE
+
   packuswb             m0, m2
 .x_other_y_other_loop:
 %if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
-  add                srcq, src_strideq
+  INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
   movh                 m0, [srcq]
@@ -1184,12 +1299,17 @@
 %if cpuflag(ssse3)
   packuswb             m0, m0
 %endif
-  add                srcq, src_strideq
+
+  INC_SRC_BY_SRC_STRIDE
+
 .x_other_y_other_loop:
   movh                 m2, [srcq]
   movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+
+  INC_SRC_BY_SRC_STRIDE
+  movh                 m4, [srcq]
+  movh                 m3, [srcq+1]
+
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
@@ -1253,7 +1373,7 @@
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
-  lea                srcq, [srcq+src_strideq*2]
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
diff --git a/vpxdec.c b/vpxdec.c
index 8e575e1..110e4ac 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -33,21 +33,6 @@
 #include "nestegg/include/nestegg/nestegg.h"
 #include "third_party/libyuv/include/libyuv/scale.h"
 
-#if CONFIG_OS_SUPPORT
-#if defined(_MSC_VER)
-#include <io.h>
-#define snprintf _snprintf
-#define isatty   _isatty
-#define fileno   _fileno
-#else
-#include <unistd.h>
-#endif
-#endif
-
-#ifndef PATH_MAX
-#define PATH_MAX 256
-#endif
-
 static const char *exec_name;
 
 static const struct {
diff --git a/vpxenc.c b/vpxenc.c
index df75b85..f0f0df3 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -49,31 +49,6 @@
 #include "webmenc.h"
 #include "y4minput.h"
 
-
-/* Need special handling of these functions on Windows */
-#if defined(_MSC_VER)
-/* MSVS doesn't define off_t, and uses _f{seek,tell}i64 */
-typedef __int64 off_t;
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#elif defined(_WIN32)
-/* MinGW defines off_t as long
-   and uses f{seek,tell}o64/off64_t for large files */
-#define fseeko fseeko64
-#define ftello ftello64
-#define off_t off64_t
-#endif
-
-#define LITERALU64(hi,lo) ((((uint64_t)hi)<<32)|lo)
-
-/* We should use 32-bit file operations in WebM file format
- * when building ARM executable file (.axf) with RVCT */
-#if !CONFIG_OS_SUPPORT
-typedef long off_t;
-#define fseeko fseek
-#define ftello ftell
-#endif
-
 /* Swallow warnings about unused results of fread/fwrite */
 static size_t wrap_fread(void *ptr, size_t size, size_t nmemb,
                          FILE *stream) {
@@ -1409,17 +1384,7 @@
     } else
       fatal("Unsupported Y4M stream.");
   } else if (input->detect.buf_read == 4 && file_is_ivf(input, &fourcc)) {
-    input->file_type = FILE_TYPE_IVF;
-    switch (fourcc) {
-      case 0x32315659:
-        input->use_i420 = 0;
-        break;
-      case 0x30323449:
-        input->use_i420 = 1;
-        break;
-      default:
-        fatal("Unsupported fourcc (%08x) in IVF", fourcc);
-    }
+    fatal("IVF is not supported as input.");
   } else {
     input->file_type = FILE_TYPE_RAW;
   }
@@ -1433,7 +1398,7 @@
 }
 
 static struct stream_state *new_stream(struct global_config *global,
-                                       struct stream_state  *prev) {
+                                       struct stream_state *prev) {
   struct stream_state *stream;
 
   stream = calloc(1, sizeof(*stream));
diff --git a/webmenc.c b/webmenc.c
index a584e9d..17bbeec 100644
--- a/webmenc.c
+++ b/webmenc.c
@@ -15,20 +15,6 @@
 #include "third_party/libmkv/EbmlWriter.h"
 #include "third_party/libmkv/EbmlIDs.h"
 
-#if defined(_MSC_VER)
-/* MSVS uses _f{seek,tell}i64 */
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#elif defined(_WIN32)
-/* MinGW defines off_t as long, and uses f{seek,tell}o64/off64_t for large
- * files */
-#define fseeko fseeko64
-#define ftello ftello64
-#define off_t off64_t
-#endif
-
-#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
-
 void Ebml_Write(struct EbmlGlobal *glob,
                 const void *buffer_in,
                 unsigned long len) {