Merge "encode_test_driver: make ~Encoder virtual" into experimental
diff --git a/build/make/Makefile b/build/make/Makefile
index 4ac5bcf..de71c61 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -103,6 +103,18 @@
 .PHONY: testdata
 testdata::
 
+# Add compiler flags for intrinsic files, keyed on filename suffix (.c.d and .c.o)
+$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
+$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
diff --git a/configure b/configure
index 6e60af1..573ec36 100755
--- a/configure
+++ b/configure
@@ -250,6 +250,7 @@
     multiple_arf
     code_zerogroup
     sb8x8
+    non420
 "
 CONFIG_LIST="
     external_build
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 711d0bd..9633ed7 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -56,7 +56,13 @@
 
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
-    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;  // presumably stops the test loop; confirm at use sites
+      ASSERT_EQ(VPX_CODEC_OK, res);  // fatal: returns from this helper only
+    }
     const vpx_image_t *img = dec->GetDxData().Next();
     md5->Add(img);
   }
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 7d1904a..cd091f3 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -91,18 +91,8 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 
-# TODO(johann) make this generic
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
-endif
-
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/denoising_sse2.c.d: CFLAGS += -msse2
-endif
 endif
 
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 4ab4f39..6b102d1 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -28,10 +28,10 @@
 }
 
 
-void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                   int stride) {
+void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride,
+                   uint8_t *dst_ptr, int stride) {
   assert(pred_ptr == dst_ptr);
-  recon(4, 4, diff_ptr, 16 >> CONFIG_SB8X8, dst_ptr, stride);
+  recon(4, 4, diff_ptr, diff_stride, dst_ptr, stride);
 }
 
 #if !CONFIG_SB8X8
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index daeb6b5..aef34c9 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -592,14 +592,16 @@
 #if !CONFIG_NEWBINTRAMODES
 void vp9_intra4x4_predict(MACROBLOCKD *xd,
                           int block_idx,
+                          BLOCK_SIZE_TYPE bsize,
                           int mode,
                           uint8_t *predictor, int pre_stride) {
+  const int bwl = b_width_log2(bsize);
+  const int wmask = (1 << bwl) - 1;
   const int have_top =
-      (block_idx >> (2 >> CONFIG_SB8X8)) || xd->up_available;
+      (block_idx >> bwl) || xd->up_available;
   const int have_left =
-      (block_idx & (3 >> CONFIG_SB8X8)) || xd->left_available;
-  const int have_right =
-      ((block_idx & (3 >> CONFIG_SB8X8)) != (3 >> CONFIG_SB8X8));
+      (block_idx & wmask) || xd->left_available;
+  const int have_right = ((block_idx & wmask) != wmask);
 
   vp9_build_intra_predictors(predictor, pre_stride,
                              predictor, pre_stride,
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index 2a7c7f3..ce33aa5 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -160,13 +160,16 @@
 
 void vp9_intra4x4_predict(MACROBLOCKD *xd,
                           int block_idx,
+                          BLOCK_SIZE_TYPE bsize,
                           int b_mode,
                           uint8_t *predictor,
                           int ps) {
+  const int bwl = b_width_log2(bsize);
+  const int wmask = (1 << bwl) - 1;
   int i, r, c;
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  const int have_right = (block_idx & 3) != 3 || xd->right_available;
+  const int have_top = (block_idx >> bwl) || xd->up_available;
+  const int have_left = (block_idx & wmask)  || xd->left_available;
+  const int have_right = (block_idx & wmask) != wmask || xd->right_available;
   uint8_t left[4], above[8], top_left;
   /*
    * 127 127 127 .. 127 127 127 127 127 127
@@ -197,8 +200,8 @@
     above[1] = above_ptr[1];
     above[2] = above_ptr[2];
     above[3] = above_ptr[3];
-    if (((block_idx & 3) != 3) ||
-        (have_right && block_idx == 3 &&
+    if (((block_idx & wmask) != wmask) ||
+        (have_right && block_idx == wmask &&
          ((xd->mb_index != 3 && xd->sb_index != 3) ||
           ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {
       above[4] = above_ptr[4];
@@ -212,7 +215,7 @@
         above_right -= 32 * ps;
       if (xd->mb_index == 3)
         above_right -= 16 * ps;
-      above_right -= (block_idx & ~3) * ps;
+      above_right -= 4 * (block_idx >> bwl) * ps;
 
       /* use a more distant above-right (from closest available top-right
        * corner), but with a "localized DC" (similar'ish to TM-pred):
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index b1acc04..e473d81 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -60,7 +60,7 @@
 prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
+prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
 if [ "$CONFIG_SB8X8" != "yes" ]; then
@@ -98,7 +98,7 @@
 prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
 specialize vp9_build_intra_predictors_sbuv_s
 
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
+prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra4x4_predict;
 
 if [ "$CONFIG_SB8X8" != "yes" ]; then
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 408573f..c663244 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -144,7 +144,8 @@
     int i;
     for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); ++i) {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = xd->left_available || (i & 3) ?
+      const B_PREDICTION_MODE l = xd->left_available ||
+                                  (i & (3 >> CONFIG_SB8X8)) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       m->bmi[i].as_mode.first = read_kf_bmode(r, cm->kf_bmode_prob[a][l]);
     }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 0f87a21..2457f79 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -354,7 +354,7 @@
     if (!xd->mode_info_context->mbmi.mb_skip_coeff)
       vp9_decode_coefs_4x4(pbi, xd, r, PLANE_TYPE_Y_WITH_DC, i);
 #endif
-    vp9_intra4x4_predict(xd, i, b_mode, dst, xd->plane[0].dst.stride);
+    vp9_intra4x4_predict(xd, i, bsize, b_mode, dst, xd->plane[0].dst.stride);
     // TODO(jingning): refactor to use foreach_transformed_block_in_plane_
     tx_type = get_tx_type_4x4(xd, i);
     dequant_add_y(xd, tx_type, i, bsize);
@@ -1246,8 +1246,6 @@
 
   setup_loopfilter(pc, xd, &header_bc);
 
-  vp9_read_literal(&header_bc, 2);  // unused
-
   setup_quantization(pbi, &header_bc);
 
   // Determine if the golden frame or ARF buffer should be updated and how.
@@ -1343,11 +1341,8 @@
   vp9_setup_block_dptrs(xd);
 
   // clear out the coeff buffer
-  vpx_memset(xd->plane[0].qcoeff, 0, sizeof(xd->plane[0].qcoeff));
-  vpx_memset(xd->plane[1].qcoeff, 0, sizeof(xd->plane[1].qcoeff));
-  vpx_memset(xd->plane[2].qcoeff, 0, sizeof(xd->plane[2].qcoeff));
-
-  vp9_read_bit(&header_bc);  // unused
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_zero(xd->plane[i].qcoeff);
 
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 7152ac9..b6dd984 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -914,7 +914,8 @@
     int i = 0;
     do {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
+      const B_PREDICTION_MODE l = (xd->left_available ||
+                                  (i & (3 >> CONFIG_SB8X8))) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       const int bm = m->bmi[i].as_mode.first;
 
@@ -1930,9 +1931,6 @@
     }
   }
 
-  // TODO(jkoleszar): remove these unused bits
-  vp9_write_literal(&header_bc, 0, 2);
-
   // Frame Q baseline quantizer index
   vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
 
@@ -2178,9 +2176,6 @@
   active_section = 2;
 #endif
 
-  // TODO(jkoleszar): remove this unused bit
-  vp9_write_bit(&header_bc, 1);
-
   vp9_update_skip_probs(cpi);
   for (i = 0; i < MBSKIP_CONTEXTS; ++i) {
     vp9_write_prob(&header_bc, pc->mbskip_pred_probs[i]);
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index d5574db..268058e 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -22,15 +22,12 @@
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
 
-#if !CONFIG_SB8X8
   if (use_16x16_pred) {
-#endif
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
     vp9_encode_intra16x16mby(&cpi->common, x);
-#if !CONFIG_SB8X8
   } else {
     int i;
 
@@ -39,7 +36,6 @@
       encode_intra4x4block(x, i, BLOCK_SIZE_MB16X16);
     }
   }
-#endif
 
   return vp9_get_mb_ss(x->plane[0].src_diff);
 }
@@ -61,36 +57,37 @@
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
 
-  assert(ib < (16 >> (2 * CONFIG_SB8X8)));
+  assert(ib < (1 << (bwl + bhl)));
 
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
     vp9_find_bpred_context(&x->e_mbd, ib, dst, xd->plane[0].dst.stride);
 #endif
 
-  vp9_intra4x4_predict(&x->e_mbd, ib,
+  vp9_intra4x4_predict(&x->e_mbd, ib, bsize,
                        xd->mode_info_context->bmi[ib].as_mode.first,
                        dst, xd->plane[0].dst.stride);
-  vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
+  vp9_subtract_block(4, 4, src_diff, 4 << bwl,
                      src, x->plane[0].src.stride,
                      dst, xd->plane[0].dst.stride);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(src_diff, coeff, 16 >> CONFIG_SB8X8, tx_type);
+    vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 16 >> CONFIG_SB8X8, tx_type);
+                     diff, 4 << bwl, tx_type);
   } else {
-    x->fwd_txm4x4(src_diff, coeff, 32 >> CONFIG_SB8X8);
+    x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 32 >> CONFIG_SB8X8);
+                                diff, 8 << bwl);
   }
 
-  vp9_recon_b(dst, diff, dst, xd->plane[0].dst.stride);
+  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }
 
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
@@ -203,7 +200,7 @@
         raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
                                   xd->plane[0].dst.buf,
                                   xd->plane[0].dst.stride);
-    vp9_recon_b_c(dst, diff, dst, xd->plane[0].dst.stride);
+    vp9_recon_b_c(dst, diff, 16, dst, xd->plane[0].dst.stride);
   }
 }
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4d28f1b..db5dd6c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -247,8 +247,8 @@
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
+  double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err /
+                   cpi->twopass.total_stats.count);
   double this_err = this_frame->ssim_weighted_pred_err;
   double modified_err;
 
@@ -328,7 +328,7 @@
   // For VBR base this on the bits and frames left plus the
   // two_pass_vbrmax_section rate passed in by the user.
   max_bits = (int) (((double) cpi->twopass.bits_left
-      / (cpi->twopass.total_stats->count - (double) cpi->common
+      / (cpi->twopass.total_stats.count - (double) cpi->common
              .current_video_frame))
                     * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
 
@@ -340,11 +340,11 @@
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_stats);
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
 static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
@@ -754,20 +754,20 @@
                             - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
+    memcpy(&cpi->twopass.this_frame_stats,
            &fps,
            sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
   // the prediction is good enough... but also dont allow it to lag too far
   if ((cpi->twopass.sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
@@ -998,7 +998,7 @@
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
   if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats->count >> 8)) &&
+       ((int)cpi->twopass.total_stats.count >> 8)) &&
       (cpi->ni_frames > 25)) {
     adjust_maxq_qrange(cpi);
   }
@@ -1055,8 +1055,8 @@
   }
 
   // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iiratio = cpi->twopass.total_stats.intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
   clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
   if (clip_iifactor < 0.80)
     clip_iifactor = 0.80;
@@ -1101,14 +1101,14 @@
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
 
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
+  zero_stats(&cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_left_stats);
 
   if (!cpi->twopass.stats_in_end)
     return;
 
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
@@ -1116,13 +1116,13 @@
   // Its calculated based on the actual durations of all frames from the first
   // pass.
   vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
+                     10000000.0 * cpi->twopass.total_stats.count /
+                     cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
                                       two_pass_min_rate / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
@@ -1148,7 +1148,8 @@
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+    cpi->twopass.avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
     // Reset file position
     reset_fpf_position(cpi, start_pos);
@@ -1831,7 +1832,7 @@
   // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
   // This is also important for short clips where there may only be one
   // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                           cpi->common.current_video_frame)) {
     cpi->twopass.kf_group_bits =
       (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
@@ -2099,7 +2100,7 @@
 
 void vp9_second_pass(VP9_COMP *cpi) {
   int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count -
+  int frames_left = (int)(cpi->twopass.total_stats.count -
                           cpi->common.current_video_frame);
 
   FIRSTPASS_STATS this_frame;
@@ -2124,7 +2125,7 @@
 
       est_cq =
         estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
+                    &cpi->twopass.total_left_stats,
                     (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
@@ -2138,7 +2139,7 @@
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     cpi->active_worst_quality         = tmp_q;
@@ -2161,15 +2162,15 @@
   // radical adjustments to the allowed quantizer range just to use up a
   // few surplus bits or get beneath the target rate.
   else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
            ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
+            (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1)
       frames_left = 1;
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
@@ -2248,7 +2249,7 @@
   cpi->twopass.frames_to_key--;
 
   // Update the total stats remaining structure
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 300fa32..60b7abf 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -332,15 +332,6 @@
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -960,23 +951,6 @@
   CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
 }
 
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9d1e984..c6abfe1 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -578,10 +578,10 @@
     unsigned int section_intra_rating;
     unsigned int next_iiratio;
     unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
     FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
+    FIRSTPASS_STATS total_left_stats;
     int first_pass_done;
     int64_t bits_left;
     int64_t clip_bits_total;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0600de2..dcbdef3 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -798,7 +798,13 @@
     rate = bmode_costs[mode];
 #endif
 
-    vp9_intra4x4_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
+    vp9_intra4x4_predict(xd, ib,
+#if CONFIG_SB8X8
+                         BLOCK_SIZE_SB8X8,
+#else
+                         BLOCK_SIZE_MB16X16,
+#endif
+                         mode, dst, xd->plane[0].dst.stride);
     vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
                        src, src_stride,
                        dst, xd->plane[0].dst.stride);
@@ -846,9 +852,15 @@
   else
     xd->inv_txm4x4(best_dqcoeff, diff, 32 >> CONFIG_SB8X8);
 
-  vp9_intra4x4_predict(xd, ib, *best_mode,
+  vp9_intra4x4_predict(xd, ib,
+#if CONFIG_SB8X8
+                       BLOCK_SIZE_SB8X8,
+#else
+                       BLOCK_SIZE_MB16X16,
+#endif
+                       *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff,
+  vp9_recon_b(dst, diff, 16 >> CONFIG_SB8X8,
               dst, xd->plane[0].dst.stride);
 
   return best_rd;
@@ -3077,6 +3089,93 @@
   }
 }
 
+// Maps a block's pixel dimensions (bw x bh) to the matching enum BlockSize
+// value, e.g. for indexing cpi->fn_ptr[].
+//
+// Each (bw, bh) pair below is distinct, so the order of the checks does not
+// affect the result. Only the 4x8 and 8x4 sizes are specific to
+// CONFIG_SB8X8 builds; every other size is handled in both configurations.
+//
+// An unsupported size trips the assert; with asserts compiled out the
+// function returns -1, which must not be used as an array index.
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+#if CONFIG_SB8X8
+  // These two sizes are only matched when CONFIG_SB8X8 is enabled.
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+#endif
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  // No supported block size has these dimensions.
+  assert(0);
+  return -1;
+}
+
+// Sums, over all MAX_MB_PLANE planes, the rate and distortion that
+// model_rd_from_var_lapndz() reports for each plane's subsampled block.
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  int plane, total_rate = 0, total_dist = 0;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblock_plane *const mbp = &x->plane[plane];
+    struct macroblockd_plane *const mbdp = &xd->plane[plane];
+    // log2 of the plane's width/height in 4-pixel units after subsampling.
+    const int w_log2 = b_width_log2(bsize) - mbdp->subsampling_x;
+    const int h_log2 = b_height_log2(bsize) - mbdp->subsampling_y;
+    const enum BlockSize bs = get_block_size(4 << w_log2, 4 << h_log2);
+    unsigned int sse;  // receives vf()'s SSE output; unused here
+    int rate, dist;
+    const unsigned int var = cpi->fn_ptr[bs].vf(mbp->src.buf, mbp->src.stride,
+                                   mbdp->dst.buf, mbdp->dst.stride, &sse);
+    // Transform coeffs are 8 times an orthogonal transform, so the quantizer
+    // step is 8 times too: divide by 8 (>> 3) for the effective quantizer.
+    model_rd_from_var_lapndz(var, 16 << (w_log2 + h_log2),
+                             mbdp->dequant[1] >> 3, &rate, &dist);
+    total_rate += rate;
+    total_dist += dist;
+  }
+
+  *out_rate_sum = total_rate;
+  *out_dist_sum = total_dist;
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
                                  int mdcounts[4], int64_t txfm_cache[],
@@ -3271,76 +3370,40 @@
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   if (1) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      if (cm->mcomp_filter_type == SWITCHABLE) {
         const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
         const int m = vp9_switchable_interp_map[mbmi->interp_filter];
         rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
       }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                         x->plane[0].src.stride,
-                                         xd->plane[0].dst.buf,
-                                         xd->plane[0].dst.stride,
-                                         &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, MI_SIZE * bw * MI_SIZE * bh,
-                                 xd->plane[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[1].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[2].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
@@ -3355,21 +3418,18 @@
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         for (i = 0; i < MI_UV_SIZE * bh; ++i)
           vpx_memcpy(tmp_vbuf + i * MI_UV_SIZE * bw,
-                     xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+                     xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
   }
 
   // Set the appripriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+      cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
   if (pred_exists) {
     // FIXME(rbultje): mb code still predicts into xd->predictor
@@ -3382,7 +3442,7 @@
                  tmp_ubuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
     for (i = 0; i < bh * MI_UV_SIZE; ++i)
-      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                  tmp_vbuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
   } else {
@@ -4744,6 +4804,21 @@
 
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+
+    // TODO(jingning): scaling not supported in SPLITMV mode.
+    if (mbmi->ref_frame > 0 &&
+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
+    if (mbmi->second_ref_frame > 0 &&
+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
+
     set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                       scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 1e6b984..6bd8b50 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -481,7 +481,7 @@
   // Note: this_frame->frame has been updated in the loop
   // so it now points at the ARF frame.
   half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats->count - this_frame - 1);
+  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 1ddd4f0..cbe3aa3 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -113,14 +113,6 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idct_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idct_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 39f836f..42ab02d 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -95,10 +95,5 @@
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/encoder/x86/vp9_dct_sse2.c.d: CFLAGS += -msse2
-vp9/encoder/x86/vp9_dct_sse2.c.o: CFLAGS += -msse2
-endif
-
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index babdebb..72cdfeb 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -38,10 +38,6 @@
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/decoder/x86/vp9_dequantize_sse2.c.o: CFLAGS += -msse2
-vp9/decoder/x86/vp9_dequantize_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 809fa38..c304bac 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -55,9 +55,11 @@
     VPX_IMG_FMT_YV12    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
     VPX_IMG_FMT_I420    = VPX_IMG_FMT_PLANAR | 2,
     VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
-    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4   /** < planar 4:2:0 format with vpx color space */
-  }
-                        vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, /**< planar 4:2:0 format with vpx color space */
+    VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5, /**< planar 4:2:2 */
+    VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6, /**< planar 4:4:4 */
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7 /**< planar 4:4:4 with an alpha plane */
+  } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
 #define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
diff --git a/vpxenc.c b/vpxenc.c
index 95c6cf2..33a56a4 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -326,6 +326,7 @@
   unsigned int          h;
   struct vpx_rational   framerate;
   int                   use_i420;
+  int                   only_i420;
 };
 
 
@@ -1793,7 +1794,8 @@
 
   if (input->detect.buf_read == 4
       && file_is_y4m(input->file, &input->y4m, input->detect.buf)) {
-    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4) >= 0) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
       input->file_type = FILE_TYPE_Y4M;
       input->w = input->y4m.pic_w;
       input->h = input->y4m.pic_h;
@@ -2517,6 +2519,7 @@
   input.framerate.num = 30;
   input.framerate.den = 1;
   input.use_i420 = 1;
+  input.only_i420 = 1;
 
   /* First parse the global configuration values, because we want to apply
    * other parameters on top of the default configuration provided by the
@@ -2551,6 +2554,12 @@
   if (!input.fn)
     usage_exit();
 
+#if CONFIG_NON420
+  /* Decide whether chroma subsamplings other than 4:2:0 are supported */
+  if (global.codec->fourcc == VP9_FOURCC)
+    input.only_i420 = 0;
+#endif
+
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
     int64_t estimated_time_left = -1;
diff --git a/y4minput.c b/y4minput.c
index 24f0c15..47f005a 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -659,7 +659,8 @@
                              unsigned char *_aux) {
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip) {
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420) {
   char buffer[80];
   int  ret;
   int  i;
@@ -701,6 +702,8 @@
             "Only progressive scan handled.\n");
     return -1;
   }
+  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
+  _y4m->vpx_bps = 12;
   if (strcmp(_y4m->chroma_type, "420") == 0 ||
       strcmp(_y4m->chroma_type, "420jpeg") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
@@ -734,16 +737,30 @@
     _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
     _y4m->convert = y4m_convert_422jpeg_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_422_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
+      _y4m->vpx_bps = 16;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                              + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "411") == 0) {
     _y4m->src_c_dec_h = 4;
     _y4m->dst_c_dec_h = 2;
@@ -758,29 +775,52 @@
     _y4m->convert = y4m_convert_411_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "444") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
+      _y4m->vpx_bps = 24;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.
-      The extra plane also gets read into the aux buf.
-      It will be discarded.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
+      _y4m->vpx_bps = 32;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
     _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
     _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
@@ -847,22 +887,23 @@
      sizes, which would require a separate fread call for every row.*/
   memset(_img, 0, sizeof(*_img));
   /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/
-  _img->fmt = IMG_FMT_I420;
+  _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
-  /*This is hard-coded to 4:2:0 for now, as that's all VP8 supports.*/
-  _img->x_chroma_shift = 1;
-  _img->y_chroma_shift = 1;
-  _img->bps = 12;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->vpx_bps;
+
   /*Set up the buffer pointers.*/
   pic_sz = _y4m->pic_w * _y4m->pic_h;
   c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _y4m->pic_w;
+  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] = _y4m->pic_w;
   _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
   _img->planes[PLANE_Y] = _y4m->dst_buf;
   _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
   _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }
diff --git a/y4minput.h b/y4minput.h
index 2fa3767..b2a390c 100644
--- a/y4minput.h
+++ b/y4minput.h
@@ -51,9 +51,12 @@
   y4m_convert_func  convert;
   unsigned char    *dst_buf;
   unsigned char    *aux_buf;
+  enum vpx_img_fmt  vpx_fmt;
+  int               vpx_bps;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip);
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);