Merge changes Ifacbf5a0,Ibad7c3dd into experimental

* changes:
  vpxenc: actually report mismatch on stderr.
  Make superblocks independent of macroblock code and data.
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2_intrinsics.c
similarity index 100%
rename from vp8/encoder/x86/quantize_sse2.c
rename to vp8/encoder/x86/quantize_sse2_intrinsics.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f6feafb..2a0e7c5 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,13 +89,13 @@
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
 # TODO(johann) make this generic
 ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2_intrinsics.c.o: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2_intrinsics.c.d: CFLAGS += -msse2
 endif
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 17d0134..bc79b5c 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -84,11 +84,4 @@
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
-
-/* If we don't want to use ROUND_POWER_OF_TWO macro
-static INLINE int16_t round_power_of_two(int16_t value, int n) {
-  return (value + (1 << (n - 1))) >> n;
-}*/
-
 #endif  // VP9_COMMON_VP9_IDCT_H_
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index d93b7d5..79d0609 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -94,7 +94,7 @@
     int Width;              // width of data passed to the compressor
     int Height;             // height of data passed to the compressor
     double frame_rate;       // set to passed in framerate
-    int target_bandwidth;    // bandwidth to be used in kilobits per second
+    int64_t target_bandwidth;    // bandwidth to be used in kilobits per second
 
     int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0
     int Sharpness;          // parameter used for sharpening output: recommendation 0:
@@ -135,9 +135,9 @@
     int over_shoot_pct;
 
     // buffering parameters
-    int starting_buffer_level;  // in seconds
-    int optimal_buffer_level;
-    int maximum_buffer_size;
+    int64_t starting_buffer_level;  // in seconds
+    int64_t optimal_buffer_level;
+    int64_t maximum_buffer_size;
 
     // controlling quality
     int fixed_q;
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 30e8951..a516eb3 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,14 +20,14 @@
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
                                        YV12_BUFFER_CONFIG *other,
                                        int this_w, int this_h) {
-  int other_w, other_h;
+  int other_h = other->y_height;
+  int other_w = other->y_width;
 
-  other_h = other->y_height;
-  other_w = other->y_width;
   scale->x_num = other_w;
   scale->x_den = this_w;
   scale->x_offset_q4 = 0;  // calculated per-mb
   scale->x_step_q4 = 16 * other_w / this_w;
+
   scale->y_num = other_h;
   scale->y_den = this_h;
   scale->y_offset_q4 = 0;  // calculated per-mb
@@ -271,10 +271,8 @@
                                const struct scale_factors *scale,
                                int w, int h, int do_avg,
                                const struct subpix_fn_table *subpix) {
-  int_mv32 mv;
-
-  mv = scale_motion_vector_q3_to_q4(mv_q3, scale);
-  src = src + (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
+  int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale);
+  src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
 
   scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][do_avg](
       src, src_stride, dst, dst_stride,
@@ -306,7 +304,7 @@
   const int subpel_x = scaled_mv_col_q4 & 15;
   const int subpel_y = scaled_mv_row_q4 & 15;
 
-  src = src + (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);
+  src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);
   scale->predict[!!subpel_x][!!subpel_y][do_avg](
       src, src_stride, dst, dst_stride,
       subpix->filter_x[subpel_x], scale->x_step_q4,
@@ -500,18 +498,15 @@
   int which_mv;
 
   for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *base_pre;
-    int_mv ymv;
-    int pre_stride;
+    const int clamp_mvs = which_mv ?
+         xd->mode_info_context->mbmi.need_to_clamp_secondmv :
+         xd->mode_info_context->mbmi.need_to_clamp_mvs;
 
+    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;
+    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;
+    int_mv ymv;
     ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-    base_pre = which_mv ? xd->second_pre.y_buffer
-                        : xd->pre.y_buffer;
-    pre_stride = which_mv ? xd->second_pre.y_stride
-                          : xd->pre.y_stride;
+
     if (clamp_mvs)
       clamp_mv_to_umv_border(&ymv.as_mv, xd);
 
@@ -811,93 +806,61 @@
   }
 }
 
-static
-void build_4x4uvmvs(MACROBLOCKD *xd) {
+static int mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
+  int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
+             mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +
+             mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row +
+             mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row;
+  return (temp < 0 ? temp - 4 : temp + 4) / 8;
+}
+
+static int mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
+  int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +
+             mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +
+             mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col +
+             mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col;
+  return (temp < 0 ? temp - 4 : temp + 4) / 8;
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *xd) {
   int i, j;
   BLOCKD *blockd = xd->block;
+  const int mask = xd->fullpixel_mask;
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      int yoffset = i * 8 + j * 2;
-      int uoffset = 16 + i * 2 + j;
-      int voffset = 20 + i * 2 + j;
+      const int yoffset = i * 8 + j * 2;
+      const int uoffset = 16 + i * 2 + j;
+      const int voffset = 20 + i * 2 + j;
 
-      int temp;
-
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.row;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) &
-                                                  xd->fullpixel_mask;
-
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.col;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) &
-        xd->fullpixel_mask;
+      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
+      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
+      u->row = mv_pred_row(xd, yoffset, 0) & mask;
+      u->col = mv_pred_col(xd, yoffset, 0) & mask;
 
       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd);
+      clamp_uvmv_to_umv_border(u, xd);
 
       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd);
+      clamp_uvmv_to_umv_border(u, xd);
 
-      blockd[voffset].bmi.as_mv[0].as_mv.row =
-        blockd[uoffset].bmi.as_mv[0].as_mv.row;
-      blockd[voffset].bmi.as_mv[0].as_mv.col =
-        blockd[uoffset].bmi.as_mv[0].as_mv.col;
+      v->row = u->row;
+      v->col = u->col;
 
       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.row;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-       blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) &
-                                                    xd->fullpixel_mask;
-
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.col;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) &
-                                                        xd->fullpixel_mask;
+        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
+        v = &blockd[voffset].bmi.as_mv[1].as_mv;
+        u->row = mv_pred_row(xd, yoffset, 1) & mask;
+        u->col = mv_pred_col(xd, yoffset, 1) & mask;
 
         // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv[1].as_mv, xd);
+        clamp_uvmv_to_umv_border(u, xd);
 
         // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv[1].as_mv, xd);
+        clamp_uvmv_to_umv_border(u, xd);
 
-        blockd[voffset].bmi.as_mv[1].as_mv.row =
-          blockd[uoffset].bmi.as_mv[1].as_mv.row;
-        blockd[voffset].bmi.as_mv[1].as_mv.col =
-          blockd[uoffset].bmi.as_mv[1].as_mv.col;
+        v->row = u->row;
+        v->col = u->col;
       }
     }
   }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index db1b467..dab88a3 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -260,7 +260,7 @@
 specialize vp9_short_idct4x4llm_1
 
 prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm
+specialize vp9_short_idct4x4llm sse2
 
 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct8x8
diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h
index 8320cf8..bd66d8c 100644
--- a/vp9/common/x86/vp9_idct_x86.h
+++ b/vp9/common/x86/vp9_idct_x86.h
@@ -20,23 +20,10 @@
  */
 
 #if HAVE_MMX
-extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
-extern prototype_idct(vp9_short_idct4x4llm_mmx);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
-
 extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
 extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
-
 #undef vp9_idct_iwalsh16
 #define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
 
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c
index 667f5c1..7b3c579 100644
--- a/vp9/common/x86/vp9_idctllm_x86.c
+++ b/vp9/common/x86/vp9_idctllm_x86.c
@@ -73,4 +73,129 @@
   p1 = _mm_srli_si128(p1, 4);
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
+
+void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i cst = _mm_setr_epi16((short)cospi_16_64, (short)cospi_16_64,
+                                     (short)cospi_16_64, (short)-cospi_16_64,
+                                     (short)cospi_24_64, (short)-cospi_8_64,
+                                     (short)cospi_8_64, (short)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const int half_pitch = pitch >> 1;
+  __m128i input0, input1, input2, input3;
+
+  // Rows
+  input0 = _mm_loadl_epi64((__m128i *)input);
+  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
+  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
+  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);
+  input1 = _mm_shufflelo_epi16(input1, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input1 = _mm_unpacklo_epi32(input1, input1);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, zero);
+  input1 = _mm_packs_epi32(input1, zero);
+  input2 = _mm_packs_epi32(input2, zero);
+  input3 = _mm_packs_epi32(input3, zero);
+
+  // Transpose
+  input1 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpacklo_epi16(input2, input3);
+  input0 = _mm_unpacklo_epi32(input1, input3);
+  input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Swap column 2 and column 3; after the add/sub below we have:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input2, 0xd8);
+  input1 = _mm_shufflehi_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input3, 0xd8);
+  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input1 = _mm_unpackhi_epi32(input1, input1);
+  input2 = _mm_unpackhi_epi32(input2, input2);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, zero);
+  input1 = _mm_packs_epi32(input1, zero);
+  input2 = _mm_packs_epi32(input2, zero);
+  input3 = _mm_packs_epi32(input3, zero);
+
+  // Transpose
+  input1 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpacklo_epi16(input2, input3);
+  input0 = _mm_unpacklo_epi32(input1, input3);
+  input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Swap column 2 and column 3; after the add/sub below we have:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Store results
+  _mm_storel_epi64((__m128i *)output, input2);
+  input2 = _mm_srli_si128(input2, 8);
+  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+
+  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
+  input3 = _mm_srli_si128(input3, 8);
+  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+}
 #endif
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 055e97b..b44d659 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -1191,13 +1191,14 @@
        * if we have enough data. Otherwise we will end up with the wrong
        * size.
        */
-      if (data + 4 < data_end) {
-        pc->Width = (data[0] | (data[1] << 8)) & 0x3fff;
-        pc->horiz_scale = data[1] >> 6;
-        pc->Height = (data[2] | (data[3] << 8)) & 0x3fff;
-        pc->vert_scale = data[3] >> 6;
+      if (data + 5 < data_end) {
+        pc->Width  = (data[0] | (data[1] << 8));
+        pc->Height = (data[2] | (data[3] << 8));
+
+        pc->horiz_scale = data[4] >> 4;
+        pc->vert_scale  = data[4] & 0x0F;
       }
-      data += 4;
+      data += 5;
 
       if (width != pc->Width || height != pc->Height) {
         if (pc->Width <= 0) {
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 85246d8..eaf9860 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -90,7 +90,7 @@
       input[i] *= dq[i];
 
     // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4llm_c(input, output, 4 << 1);
+    vp9_short_idct4x4llm(input, output, 4 << 1);
 
     vpx_memset(input, 0, 32);
 
@@ -112,7 +112,7 @@
     input[i] *= dq[i];
 
   // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4llm_c(input, output, 4 << 1);
+  vp9_short_idct4x4llm(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
   add_residual(output, pred, pitch, dest, stride, 4, 4);
 }
diff --git a/vp9/decoder/x86/vp9_idct_mmx.h b/vp9/decoder/x86/vp9_idct_mmx.h
index c0e9bfd..7d98291 100644
--- a/vp9/decoder/x86/vp9_idct_mmx.h
+++ b/vp9/decoder/x86/vp9_idct_mmx.h
@@ -16,9 +16,6 @@
                                  unsigned char *pred, unsigned char *dest,
                                  int pitch, int stride, int Dc);
 
-void vp9_dc_only_idct_add_mmx(short input_dc, const unsigned char *pred_ptr,
-                              unsigned char *dst_ptr, int pitch, int stride);
-
 void vp9_dequant_idct_add_mmx(short *input, const short *dq, unsigned char *pred,
                               unsigned char *dest, int pitch, int stride);
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 971da05..b05da87 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1500,17 +1500,20 @@
   {
     int v;
 
-    /* TODO(jkoleszar): support arbitrary resolutions */
-    v = (pc->horiz_scale << 14) | pc->Width;
+    // support arbitrary resolutions
+    v = pc->Width;
     cx_data[0] = v;
     cx_data[1] = v >> 8;
 
-    v = (pc->vert_scale << 14) | pc->Height;
+    v = pc->Height;
     cx_data[2] = v;
     cx_data[3] = v >> 8;
 
-    extra_bytes_packed += 4;
-    cx_data += 4;
+    // use a separate byte to store the scale factors, each ranging 0-15
+    cx_data[4] = (pc->horiz_scale << 4) | (pc->vert_scale);
+
+    extra_bytes_packed += 5;
+    cx_data += 5;
   }
 
   vp9_start_encode(&header_bc, cx_data);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3b48f46..a4dbdc5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1247,8 +1247,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   int totalrate;
 
-  // printf("encode_frame_internal frame %d (%d)\n",
-  //        cpi->common.current_video_frame, cpi->common.show_frame);
+  // fprintf(stderr, "encode_frame_internal frame %d (%d)\n",
+  //         cpi->common.current_video_frame, cpi->common.show_frame);
 
   // Compute a modified set of reference frame probabilities to use when
   // prediction fails. These are based on the current general estimates for
@@ -1329,12 +1329,11 @@
       // Take tiles into account and give start/end MB
       int tile_col;
       TOKENEXTRA *tp = cpi->tok;
-
       for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
         TOKENEXTRA *tp_old = tp;
-
         // For each row of SBs in the frame
         vp9_get_tile_col_offsets(cm, tile_col);
+
         for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
           encode_sb_row(cpi, mb_row, &tp, &totalrate);
         }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 5278ac2..6335827 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1086,14 +1086,12 @@
     cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
 }
 
-
-static int
-rescale(int val, int num, int denom) {
+static int64_t rescale(int val, int64_t num, int denom) {
   int64_t llnum = num;
   int64_t llden = denom;
   int64_t llval = val;
 
-  return (int)(llval * llnum / llden);
+  return (llval * llnum / llden);
 }
 
 static void set_tile_limits(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 50780d0..3dc4772 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -485,7 +485,7 @@
   int kf_boost;
   int kf_zeromotion_pct;
 
-  int target_bandwidth;
+  int64_t target_bandwidth;
   struct vpx_codec_pkt_list  *output_pkt_list;
 
 #if 0
diff --git a/vp9/encoder/vp9_picklpf.h b/vp9/encoder/vp9_picklpf.h
index cb01500..ca3cab6 100644
--- a/vp9/encoder/vp9_picklpf.h
+++ b/vp9/encoder/vp9_picklpf.h
@@ -15,12 +15,12 @@
 struct yv12_buffer_config;
 struct VP9_COMP;
 
-extern void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd,
-                                       struct VP9_COMP *cpi);
+void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd,
+                                struct VP9_COMP *cpi);
 
-extern void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);
+void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);
 
-extern void vp9_pick_filter_level(struct yv12_buffer_config *sd,
-                                  struct VP9_COMP *cpi);
+void vp9_pick_filter_level(struct yv12_buffer_config *sd,
+                           struct VP9_COMP *cpi);
 
 #endif  // VP9_ENCODER_VP9_PICKLPF_H_
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c5b3e3a..61379b8 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -193,19 +193,17 @@
 }
 
 static int compute_rd_mult(int qindex) {
-  int q;
-
-  q = vp9_dc_quant(qindex, 0);
+  int q = vp9_dc_quant(qindex, 0);
   return (11 * q * q) >> 6;
 }
 
-void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
-  cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
-  cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
+void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
+  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
+  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 }
 
 
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   int q, i;
 
   vp9_clear_system_state();  // __asm emms;
@@ -214,16 +212,16 @@
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
+  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
 
-  cpi->RDMULT = compute_rd_mult(QIndex);
+  cpi->RDMULT = compute_rd_mult(qindex);
 
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     if (cpi->twopass.next_iiratio > 31)
       cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
     else
       cpi->RDMULT +=
-        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+          (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
   }
 
   if (cpi->RDMULT < 7)
@@ -234,8 +232,8 @@
 
   vp9_set_speed_features(cpi);
 
-  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
-  q = q << 2;
+  q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
+  q <<= 2;
   cpi->RDMULT = cpi->RDMULT << 4;
 
   if (q < 8)
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 01b1560..d1b4777 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -15,34 +15,34 @@
 #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
 #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
 
-extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
 
-extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
+void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
-extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                   int *r, int *d);
+void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            int *r, int *d);
 
-extern void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                        int *r, int *d);
+void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                 int *r, int *d);
 
-extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                        int *r, int *d);
+void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                 int *r, int *d);
 
-extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int mb_row, int mb_col,
-                                           int *r, int *d);
+void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int mb_row, int mb_col,
+                                    int *r, int *d);
 
-extern int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int mb_row, int mb_col,
-                                           int *r, int *d);
+int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int mb_row, int mb_col,
+                                    int *r, int *d);
 
-extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int mb_row, int mb_col,
-                                           int *r, int *d);
+int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int mb_row, int mb_col,
+                                    int *r, int *d);
 
-extern void vp9_init_me_luts();
+void vp9_init_me_luts();
 
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
-                                   MB_PREDICTION_MODE mb, int_mv *mv);
+void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
+                            MB_PREDICTION_MODE mb, int_mv *mv);
 
 #endif  // VP9_ENCODER_VP9_RDOPT_H_
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 2653954..db7a2fd 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -126,8 +126,8 @@
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
                                        const struct vp8_extracfg *vp8_cfg) {
-  RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
-  RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
+  RANGE_CHECK(cfg, g_w,                   1, 65535); /* 16 bits available */
+  RANGE_CHECK(cfg, g_h,                   1, 65535); /* 16 bits available */
   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
   RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
   RANGE_CHECK_HI(cfg, g_profile,          3);
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index b2ce9aa..f2b80e1 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -229,8 +229,8 @@
       if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
         res = VPX_CODEC_UNSUP_BITSTREAM;
 
-      si->w = (c[3] | (c[4] << 8)) & 0x3fff;
-      si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+      si->w = (c[3] | (c[4] << 8));
+      si->h = (c[5] | (c[6] << 8));
 
       /*printf("w=%d, h=%d\n", si->w, si->h);*/
       if (!(si->h | si->w))