Merge "Renaming "mbskip" to "skip"."
diff --git a/ivfenc.c b/ivfenc.c
index 0041ff0..4a97c42 100644
--- a/ivfenc.c
+++ b/ivfenc.c
@@ -10,7 +10,6 @@
 
 #include "./ivfenc.h"
 
-#include "./tools_common.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx_ports/mem_ops.h"
 
@@ -24,33 +23,31 @@
   header[1] = 'K';
   header[2] = 'I';
   header[3] = 'F';
-  mem_put_le16(header + 4,  0);                 /* version */
-  mem_put_le16(header + 6,  32);                /* headersize */
-  mem_put_le32(header + 8,  fourcc);            /* four CC */
-  mem_put_le16(header + 12, cfg->g_w);          /* width */
-  mem_put_le16(header + 14, cfg->g_h);          /* height */
-  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
-  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
-  mem_put_le32(header + 24, frame_cnt);         /* length */
-  mem_put_le32(header + 28, 0);                 /* unused */
+  mem_put_le16(header + 4, 0);                     // version
+  mem_put_le16(header + 6, 32);                    // header size
+  mem_put_le32(header + 8, fourcc);                // fourcc
+  mem_put_le16(header + 12, cfg->g_w);             // width
+  mem_put_le16(header + 14, cfg->g_h);             // height
+  mem_put_le32(header + 16, cfg->g_timebase.den);  // rate
+  mem_put_le32(header + 20, cfg->g_timebase.num);  // scale
+  mem_put_le32(header + 24, frame_cnt);            // length
+  mem_put_le32(header + 28, 0);                    // unused
 
-  (void) fwrite(header, 1, 32, outfile);
+  fwrite(header, 1, 32, outfile);
 }
 
-void ivf_write_frame_header(FILE *outfile, const struct vpx_codec_cx_pkt *pkt) {
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
   char header[12];
-  vpx_codec_pts_t pts;
 
-  pts = pkt->data.frame.pts;
-  mem_put_le32(header, (int)pkt->data.frame.sz);
-  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
-  mem_put_le32(header + 8, pts >> 32);
-
-  (void) fwrite(header, 1, 12, outfile);
+  mem_put_le32(header, (int)frame_size);
+  mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF));
+  mem_put_le32(header + 8, (int)(pts >> 32));
+  fwrite(header, 1, 12, outfile);
 }
 
-void ivf_write_frame_size(FILE *outfile, size_t size) {
+void ivf_write_frame_size(FILE *outfile, size_t frame_size) {
   char header[4];
-  mem_put_le32(header, (int)size);
-  (void) fwrite(header, 1, 4, outfile);
+
+  mem_put_le32(header, (int)frame_size);
+  fwrite(header, 1, 4, outfile);
 }
diff --git a/ivfenc.h b/ivfenc.h
index b486bc8..6623687 100644
--- a/ivfenc.h
+++ b/ivfenc.h
@@ -23,8 +23,10 @@
                            const struct vpx_codec_enc_cfg *cfg,
                            uint32_t fourcc,
                            int frame_cnt);
-void ivf_write_frame_header(FILE *outfile, const struct vpx_codec_cx_pkt *pkt);
-void ivf_write_frame_size(FILE *outfile, size_t size);
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size);
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size);
 
 #ifdef __cplusplus
 }  /* extern "C" */
diff --git a/libs.mk b/libs.mk
index 470066a..cc40451 100644
--- a/libs.mk
+++ b/libs.mk
@@ -214,8 +214,11 @@
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 
+obj_int_extract.bat: $(SRC_PATH_BARE)/build/$(MSVS_ARCH_DIR)/obj_int_extract.bat
+	@cp $^ $@
+
+obj_int_extract.$(VCPROJ_SFX): obj_int_extract.bat
 obj_int_extract.$(VCPROJ_SFX): $(SRC_PATH_BARE)/build/make/obj_int_extract.c
-	@cp $(SRC_PATH_BARE)/build/$(MSVS_ARCH_DIR)/obj_int_extract.bat .
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
     --exe \
diff --git a/test/i420_video_source.h b/test/i420_video_source.h
index 2bf2a03..c3315f9 100644
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -52,7 +52,7 @@
     ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
         << file_name_;
     if (start_) {
-      fseek(input_file_, raw_sz_ * start_, SEEK_SET);
+      fseek(input_file_, static_cast<unsigned>(raw_sz_) * start_, SEEK_SET);
     }
 
     frame_ = start_;
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 3211c5c..9e242a2 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -59,7 +59,7 @@
     int16_t *src_diff = be.src_diff;
     for (int r = 0; r < kBlockHeight; ++r) {
       for (int c = 0; c < kBlockWidth; ++c) {
-        src_diff[c] = static_cast<int16_t>(0xa5a5);
+        src_diff[c] = static_cast<int16_t>(0xa5a5u);
       }
       src_diff += kDiffPredStride;
     }
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 3ddd9c1..75659d5 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -234,7 +234,7 @@
   video.Begin();
 
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_REALTIME);
+                       video.duration(), VPX_DL_GOOD_QUALITY);
   EXPECT_EQ(VPX_CODEC_OK, res);
 
   const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
@@ -262,7 +262,7 @@
   video.Begin();
   // This frame is a keyframe.
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_REALTIME);
+                       video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
   EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
 
@@ -275,7 +275,7 @@
   video.Next();
   // This is a P-frame.
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_REALTIME);
+                       video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
   EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
 
@@ -288,7 +288,7 @@
   video.Next();
   // This is a P-frame.
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_REALTIME);
+                       video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
   EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
 
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 6daf69e..a287731 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -574,3 +574,5 @@
 84c1599298aac78f2fc05ae2274575d10569dfa0  vp90-2-09-aq2.webm.md5
 55fc55ed73d578ed60fad05692579873f8bad758  vp90-2-09-lf_deltas.webm
 54638c38009198c38c8f3b25c182b709b6c1fd2e  vp90-2-09-lf_deltas.webm.md5
+510d95f3beb3b51c572611fdaeeece12277dac30  vp90-2-10-show-existing-frame.webm
+14d631096f4bfa2d71f7f739aec1448fb3c33bad  vp90-2-10-show-existing-frame.webm.md5
diff --git a/test/test.mk b/test/test.mk
index cb62615..a65decf 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -676,6 +676,8 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 4adf9af..53b7636 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -35,7 +35,7 @@
 
   void OpenMD5File(const std::string& md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_) << "Md5 file open failed. Filename: "
+    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
         << md5_file_name_;
   }
 
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 16298d0..3227f52 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -160,6 +160,7 @@
   "vp90-2-02-size-lf-1920x1080.webm",
   "vp90-2-09-aq2.webm",
   "vp90-2-09-lf_deltas.webm",
+  "vp90-2-10-show-existing-frame.webm",
 #if CONFIG_NON420
   "vp91-2-04-yv444.webm"
 #endif
diff --git a/test/test_vectors.h b/test/test_vectors.h
index 5f62e99..eb592de 100644
--- a/test/test_vectors.h
+++ b/test/test_vectors.h
@@ -22,9 +22,9 @@
 
 #if CONFIG_VP9_DECODER
 #if CONFIG_NON420
-const int kNumVp9TestVectors = 216;
+const int kNumVp9TestVectors = 217;
 #else
-const int kNumVp9TestVectors = 215;
+const int kNumVp9TestVectors = 216;
 #endif
 
 extern const char *kVP9TestVectors[kNumVp9TestVectors];
diff --git a/test/vp8_boolcoder_test.cc b/test/vp8_boolcoder_test.cc
index fa7ee6e..7c6c601 100644
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@@ -43,7 +43,7 @@
 
 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
                            uint8_t *output, int count) {
-  int offset = input - reinterpret_cast<uint8_t *>(decrypt_state);
+  const size_t offset = input - reinterpret_cast<uint8_t*>(decrypt_state);
   for (int i = 0; i < count; i++) {
     output[i] = input[i] ^ secret_key[(offset + i) & 15];
   }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 2266e0e..dd304c9 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -16,26 +16,6 @@
 
 #include "vp9/common/vp9_seg_common.h"
 
-// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
-// Each 1 bit represents a position in which we want to apply the loop filter.
-// Left_ entries refer to whether we apply a filter on the border to the
-// left of the block.   Above_ entries refer to whether or not to apply a
-// filter on the above border.   Int_ entries refer to whether or not to
-// apply borders on the 4x4 edges within the 8x8 block that each bit
-// represents.
-// Since each transform is accompanied by a potentially different type of
-// loop filter there is a different entry in the array for each transform size.
-typedef struct {
-  uint64_t left_y[TX_SIZES];
-  uint64_t above_y[TX_SIZES];
-  uint64_t int_4x4_y;
-  uint16_t left_uv[TX_SIZES];
-  uint16_t above_uv[TX_SIZES];
-  uint16_t int_4x4_uv;
-  uint8_t lfl_y[64];
-  uint8_t lfl_uv[16];
-} LOOP_FILTER_MASK;
-
 // 64 bit masks for left transform size.  Each 1 represents a position where
 // we should apply a loop filter across the left border of an 8x8 block
 // boundary.
@@ -638,9 +618,9 @@
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 // TODO(JBB): This function only works for yv12.
-static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
-                       MODE_INFO **mi_8x8, const int mode_info_stride,
-                       LOOP_FILTER_MASK *lfm) {
+void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
+                    MODE_INFO **mi_8x8, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm) {
   int idx_32, idx_16, idx_8;
   const loop_filter_info_n *const lfi_n = &cm->lf_info;
   MODE_INFO **mip = mi_8x8;
@@ -1069,10 +1049,10 @@
 }
 #endif
 
-static void filter_block_plane(VP9_COMMON *const cm,
-                               struct macroblockd_plane *const plane,
-                               int mi_row,
-                               LOOP_FILTER_MASK *lfm) {
+void vp9_filter_block_plane(VP9_COMMON *const cm,
+                            struct macroblockd_plane *const plane,
+                            int mi_row,
+                            LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
   int r, c;
@@ -1244,14 +1224,14 @@
 #if CONFIG_NON420
       if (use_420)
 #endif
-        setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
-                   &lfm);
+        vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col,
+                       cm->mode_info_stride, &lfm);
 
       for (plane = 0; plane < num_planes; ++plane) {
 #if CONFIG_NON420
         if (use_420)
 #endif
-          filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
+          vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
 #if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 43373f4..668e898 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -60,9 +60,42 @@
   uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
 } loop_filter_info_n;
 
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block.   Above_ entries refer to whether or not to apply a
+// filter on the above border.   Int_ entries refer to whether or not to
+// apply borders on the 4x4 edges within the 8x8 block that each bit
+// represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+  uint64_t left_y[TX_SIZES];
+  uint64_t above_y[TX_SIZES];
+  uint64_t int_4x4_y;
+  uint16_t left_uv[TX_SIZES];
+  uint16_t above_uv[TX_SIZES];
+  uint16_t int_4x4_uv;
+  uint8_t lfl_y[64];
+  uint8_t lfl_uv[16];
+} LOOP_FILTER_MASK;
+
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;
+struct VP9LfSyncData;
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+void vp9_setup_mask(struct VP9Common *const cm,
+                    const int mi_row, const int mi_col,
+                    MODE_INFO **mi_8x8, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane(struct VP9Common *const cm,
+                            struct macroblockd_plane *const plane,
+                            int mi_row,
+                            LOOP_FILTER_MASK *lfm);
 
 void vp9_loop_filter_init(struct VP9Common *cm);
 
@@ -90,6 +123,9 @@
   int start;
   int stop;
   int y_only;
+
+  struct VP9LfSyncData *lf_sync;
+  int num_lf_workers;
 } LFWorkerData;
 
 // Operates on the rows described by LFWorkerData passed as 'arg1'.
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 9df76de..564e419 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -55,6 +55,7 @@
     MODE_FIRSTPASS      = 0x3,
     MODE_SECONDPASS     = 0x4,
     MODE_SECONDPASS_BEST = 0x5,
+    MODE_REALTIME       = 0x6,
   } MODE;
 
   typedef enum {
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 3b2b48c..a4df051 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -33,18 +33,12 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_dsubexp.h"
+#include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_reader.h"
 #include "vp9/decoder/vp9_thread.h"
 
-typedef struct TileWorkerData {
-  VP9_COMMON *cm;
-  vp9_reader bit_reader;
-  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
-} TileWorkerData;
-
 static int read_be32(const uint8_t *p) {
   return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
 }
@@ -982,7 +976,6 @@
       ++pbi->num_tile_workers;
 
       vp9_worker_init(worker);
-      worker->hook = (VP9WorkerHook)tile_worker_hook;
       CHECK_MEM_ERROR(cm, worker->data1,
                       vpx_memalign(32, sizeof(TileWorkerData)));
       CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
@@ -993,6 +986,11 @@
     }
   }
 
+  // Reset tile decoding hook
+  for (n = 0; n < pbi->num_tile_workers; ++n) {
+    pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+  }
+
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
   vpx_memset(pbi->above_context[0], 0,
@@ -1392,9 +1390,6 @@
     *p_data_end = decode_tiles(pbi, data + first_partition_size);
   }
 
-  cm->last_width = cm->width;
-  cm->last_height = cm->height;
-
   new_fb->corrupted |= xd->corrupted;
 
   if (!pbi->decoded_key_frame) {
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
new file mode 100644
index 0000000..280e351
--- /dev/null
+++ b/vp9/decoder/vp9_dthread.c
@@ -0,0 +1,259 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_MULTITHREAD
+// Acquires |mutex|: polls pthread_mutex_trylock() up to kMaxTryLocks times
+// and, if every attempt fails, falls back to a blocking pthread_mutex_lock().
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+  const int kMaxTryLocks = 4000;
+  int attempts_left = kMaxTryLocks;
+
+  while (attempts_left-- > 0) {
+    // pthread_mutex_trylock() returns 0 once the lock has been taken.
+    if (pthread_mutex_trylock(mutex) == 0)
+      return;
+  }
+
+  // Every polling attempt failed; block until the mutex is available.
+  pthread_mutex_lock(mutex);
+}
+#endif  // CONFIG_MULTITHREAD
+
+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = lf_sync->sync_range;
+  // Rows after the first wait on row r - 1 whenever (c & (nsync - 1)) is 0.
+  if (r && !(c & (nsync - 1))) {
+    mutex_lock(&lf_sync->mutex_[r - 1]);
+    // Sleep until cur_sb_col[r - 1] has reached at least c + nsync.
+    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
+      pthread_cond_wait(&lf_sync->cond_[r - 1],
+                        &lf_sync->mutex_[r - 1]);
+    }
+    pthread_mutex_unlock(&lf_sync->mutex_[r - 1]);
+  }
+#else
+  (void)lf_sync;  // No synchronization without CONFIG_MULTITHREAD.
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
+                              const int sb_cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = lf_sync->sync_range;
+  int cur;
+  // Only signal when enough superblocks are filtered for the next row to run.
+  int sig = 1;
+  // Before the last column: publish c, but only when c is a multiple of nsync.
+  if (c < sb_cols - 1) {
+    cur = c;
+    if (c % nsync)
+      sig = 0;
+  } else {
+    cur = sb_cols + nsync;  // Last column: large enough to end any wait.
+  }
+
+  if (sig) {
+    mutex_lock(&lf_sync->mutex_[r]);
+    // Update row r's progress under its mutex, then signal its condition.
+    lf_sync->cur_sb_col[r] = cur;
+
+    pthread_cond_signal(&lf_sync->cond_[r]);
+    pthread_mutex_unlock(&lf_sync->mutex_[r]);
+  }
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Loop-filters superblock rows start, start + num_lf_workers, ... below stop.
+static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
+                                VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                                int start, int stop, int y_only,
+                                VP9LfSync *const lf_sync, int num_lf_workers) {
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  int r, c;  // SB row and col
+  LOOP_FILTER_MASK lfm;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  // Visit every num_lf_workers-th superblock row, beginning at start.
+  for (r = start; r < stop; r += num_lf_workers) {
+    const int mi_row = r << MI_BLOCK_SIZE_LOG2;
+    MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride;
+
+    for (c = 0; c < sb_cols; ++c) {
+      const int mi_col = c << MI_BLOCK_SIZE_LOG2;
+      int plane;
+      // Wait until the row above is far enough ahead (see sync_read).
+      sync_read(lf_sync, r, c);
+
+      setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+      vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride,
+                     &lfm);
+
+      for (plane = 0; plane < num_planes; ++plane) {
+        vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
+      }
+      // Publish this row's progress for the row below (see sync_write).
+      sync_write(lf_sync, r, c, sb_cols);
+    }
+  }
+}
+
+// Worker hook: runs the row loopfilter described by arg1's lfdata member.
+// arg1 is cast to TileWorkerData; arg2 is not used. Always returns 1.
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+  LFWorkerData *const job = &((TileWorkerData*)arg1)->lfdata;
+
+  loop_filter_rows_mt(job->frame_buffer, job->cm, &job->xd, job->start,
+                      job->stop, job->y_only, job->lf_sync,
+                      job->num_lf_workers);
+  return 1;
+}
+
+// VP9 decoder: loop-filters the buffer from get_frame_new_buffer() using the
+// tile worker threads; worker i takes superblock rows i, i + N, i + 2N, ...
+void vp9_loop_filter_frame_mt(VP9D_COMP *pbi,
+                              VP9_COMMON *cm,
+                              MACROBLOCKD *xd,
+                              int frame_filter_level,
+                              int y_only, int partial) {
+  // Number of superblock rows in the frame.
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int i;  // NOTE(review): the xd and partial arguments are never read below.
+
+  // (Re)allocate the thread synchronization memory when current_video_frame
+  // is 0 or the height changed. This is done even if frame_filter_level is 0.
+  if (!cm->current_video_frame || cm->last_height != cm->height) {
+    VP9LfSync *const lf_sync = &pbi->lf_row_sync;
+    // NOTE(review): a width-only change does not refresh sync_range here.
+    if (cm->last_height != cm->height) {
+      const int aligned_last_height =
+          ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2);
+      const int last_sb_rows =
+          mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >>
+          MI_BLOCK_SIZE_LOG2;
+      // Release the arrays using the row count derived from last_height.
+      vp9_loop_filter_dealloc(lf_sync, last_sb_rows);
+    }
+
+    vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
+  }
+
+  if (!frame_filter_level) return;
+
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  vpx_memset(pbi->lf_row_sync.cur_sb_col, -1,
+             sizeof(*pbi->lf_row_sync.cur_sb_col) * sb_rows);
+
+  // Set up loopfilter thread data.
+  for (i = 0; i < pbi->num_tile_workers; ++i) {
+    VP9Worker *const worker = &pbi->tile_workers[i];
+    TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+    LFWorkerData *const lf_data = &tile_data->lfdata;
+    // Replace the worker's hook with the loopfilter row hook.
+    worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+
+    // Loopfilter data
+    lf_data->frame_buffer = get_frame_new_buffer(cm);
+    lf_data->cm = cm;
+    lf_data->xd = pbi->mb;  // each worker gets its own copy of pbi->mb
+    lf_data->start = i;  // first row for worker i; stride is num_lf_workers
+    lf_data->stop = sb_rows;
+    lf_data->y_only = y_only;   // forwarded unchanged from the caller
+
+    lf_data->lf_sync = &pbi->lf_row_sync;
+    lf_data->num_lf_workers = pbi->num_tile_workers;
+
+    // The last worker uses vp9_worker_execute(); the others are launched.
+    if (i == pbi->num_tile_workers - 1) {
+      vp9_worker_execute(worker);
+    } else {
+      vp9_worker_launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < pbi->num_tile_workers; ++i) {
+    vp9_worker_sync(&pbi->tile_workers[i]);
+  }
+}
+
+// Maps a frame width to the loopfilter sync range (nsync): 1 below 640,
+// 2 up to 1280, 4 up to 4096 and 8 beyond that.
+static int get_sync_range(int width) {
+  // The thresholds were chosen empirically; for example, 4 gave the best
+  // performance for 4k video.
+  int nsync = 8;
+
+  if (width <= 4096) nsync = 4;
+  if (width <= 1280) nsync = 2;
+  if (width < 640) nsync = 1;
+
+  return nsync;
+}
+
+// Allocate |rows| entries of lf row synchronization state; set sync_range.
+void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
+                           int width) {
+#if CONFIG_MULTITHREAD
+  int i;
+  // One mutex and one condition variable per row.
+  CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+                  vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+  CHECK_MEM_ERROR(cm, lf_sync->cond_,
+                  vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+
+  for (i = 0; i < rows; ++i) {
+    pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+    pthread_cond_init(&lf_sync->cond_[i], NULL);
+  }
+#endif  // CONFIG_MULTITHREAD
+  // cur_sb_col is allocated with or without CONFIG_MULTITHREAD.
+  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+  // Choose the sync range (nsync) from |width|.
+  lf_sync->sync_range = get_sync_range(width);
+}
+
+// Destroy and free the lf row synchronization data and NULL the pointers.
+// Arrays that were never allocated (still NULL) are skipped, not indexed.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
+  (void)rows;  // Only read when CONFIG_MULTITHREAD is enabled.
+  if (lf_sync == NULL) return;
+#if CONFIG_MULTITHREAD
+  if (lf_sync->mutex_ != NULL && lf_sync->cond_ != NULL) {
+    int i;
+    for (i = 0; i < rows; ++i) {
+      pthread_mutex_destroy(&lf_sync->mutex_[i]);
+      pthread_cond_destroy(&lf_sync->cond_[i]);
+    }
+  }
+  vpx_free(lf_sync->mutex_);
+  vpx_free(lf_sync->cond_);
+  lf_sync->mutex_ = NULL;
+  lf_sync->cond_ = NULL;
+#endif  // CONFIG_MULTITHREAD
+  vpx_free(lf_sync->cur_sb_col);
+  lf_sync->cur_sb_col = NULL;
+}
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
new file mode 100644
index 0000000..4478354
--- /dev/null
+++ b/vp9/decoder/vp9_dthread.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_DECODER_VP9_DTHREAD_H_
+#define VP9_DECODER_VP9_DTHREAD_H_
+
+#include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/decoder/vp9_reader.h"
+#include "vp9/decoder/vp9_thread.h"
+
+struct macroblockd;
+struct VP9Common;
+struct VP9Decompressor;
+
+typedef struct TileWorkerData {  // Stored in each tile worker's data1.
+  struct VP9Common *cm;
+  vp9_reader bit_reader;
+  DECLARE_ALIGNED(16, struct macroblockd, xd);
+  DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);  // per plane
+
+  // Row-based parallel loopfilter job; vp9_loop_filter_frame_mt() fills it in.
+  LFWorkerData lfdata;
+} TileWorkerData;
+
+// Loopfilter row synchronization state, indexed by superblock row.
+typedef struct VP9LfSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;  // one mutex per superblock row
+  pthread_cond_t *cond_;    // one condition variable per superblock row
+#endif
+  // Per-row record of the most recently loop-filtered superblock column.
+  int *cur_sb_col;
+  // The optimal sync_range for different resolutions and platforms should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+} VP9LfSync;
+
+// Allocate memory for loopfilter row synchronization.
+void vp9_loop_filter_alloc(struct VP9Common *cm, struct VP9LfSyncData *lf_sync,
+                           int rows, int width);
+
+// Deallocate loopfilter synchronization related mutex and data.
+void vp9_loop_filter_dealloc(struct VP9LfSyncData *lf_sync, int rows);
+
+// Multi-threaded loopfilter that uses the tile threads.
+void vp9_loop_filter_frame_mt(struct VP9Decompressor *pbi,
+                              struct VP9Common *cm,
+                              struct macroblockd *xd,
+                              int frame_filter_level,
+                              int y_only, int partial);
+
+#endif  // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 75d52c2..c14a05d 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -27,6 +27,7 @@
 #include "vpx_ports/vpx_timer.h"
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_dthread.h"
 #include "./vpx_scale_rtcd.h"
 
 #define WRITE_RECON_BUFFER 0
@@ -177,6 +178,16 @@
     vpx_free(worker->data2);
   }
   vpx_free(pbi->tile_workers);
+
+  if (pbi->num_tile_workers) {
+    VP9_COMMON *const cm = &pbi->common;
+    const int sb_rows =
+        mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+    VP9LfSync *const lf_sync = &pbi->lf_row_sync;
+
+    vp9_loop_filter_dealloc(lf_sync, sb_rows);
+  }
+
   vpx_free(pbi->mi_streams);
   vpx_free(pbi->above_context[0]);
   vpx_free(pbi->above_seg_context);
@@ -370,7 +381,13 @@
 #endif
 
   if (!pbi->do_loopfilter_inline) {
-    vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
+    // If multiple threads are used to decode tiles, then we use those threads
+    // to do parallel loopfiltering.
+    if (pbi->num_tile_workers) {
+      vp9_loop_filter_frame_mt(pbi, cm, &pbi->mb, cm->lf.filter_level, 0, 0);
+    } else {
+      vp9_loop_filter_frame(cm, &pbi->mb, cm->lf.filter_level, 0, 0);
+    }
   }
 
 #if WRITE_RECON_BUFFER == 2
@@ -390,7 +407,11 @@
 
   vp9_clear_system_state();
 
-  cm->last_show_frame = cm->show_frame;
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+
+  if (!cm->show_existing_frame)
+    cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
     if (!cm->show_existing_frame) {
       // current mip will be the prev_mip for the next frame
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 242c600..6c6c239 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_thread.h"
 
@@ -49,6 +50,8 @@
   VP9Worker *tile_workers;
   int num_tile_workers;
 
+  VP9LfSync lf_row_sync;
+
   /* Each tile column has its own MODE_INFO stream. This array indexes them by
      tile column index. */
   MODE_INFO **mi_streams;
diff --git a/vp9/decoder/vp9_thread.c b/vp9/decoder/vp9_thread.c
index d953e72..5d31d3d 100644
--- a/vp9/decoder/vp9_thread.c
+++ b/vp9/decoder/vp9_thread.c
@@ -24,116 +24,6 @@
 
 #if CONFIG_MULTITHREAD
 
-#if defined(_WIN32)
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#include <process.h>  // NOLINT
-
-// _beginthreadex requires __stdcall
-#define THREADFN unsigned int __stdcall
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-static int pthread_create(pthread_t* const thread, const void* attr,
-                          unsigned int (__stdcall *start)(void*), void* arg) {
-  (void)attr;
-  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
-                                      0,      /* unsigned stack_size */
-                                      start,
-                                      arg,
-                                      0,      /* unsigned initflag */
-                                      NULL);  /* unsigned *thrdaddr */
-  if (*thread == NULL) return 1;
-  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
-  return 0;
-}
-
-static int pthread_join(pthread_t thread, void** value_ptr) {
-  (void)value_ptr;
-  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
-          CloseHandle(thread) == 0);
-}
-
-// Mutex
-static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) {
-  (void)mutexattr;
-  InitializeCriticalSection(mutex);
-  return 0;
-}
-
-static int pthread_mutex_lock(pthread_mutex_t* const mutex) {
-  EnterCriticalSection(mutex);
-  return 0;
-}
-
-static int pthread_mutex_unlock(pthread_mutex_t* const mutex) {
-  LeaveCriticalSection(mutex);
-  return 0;
-}
-
-static int pthread_mutex_destroy(pthread_mutex_t* const mutex) {
-  DeleteCriticalSection(mutex);
-  return 0;
-}
-
-// Condition
-static int pthread_cond_destroy(pthread_cond_t* const condition) {
-  int ok = 1;
-  ok &= (CloseHandle(condition->waiting_sem_) != 0);
-  ok &= (CloseHandle(condition->received_sem_) != 0);
-  ok &= (CloseHandle(condition->signal_event_) != 0);
-  return !ok;
-}
-
-static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) {
-  (void)cond_attr;
-  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
-  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
-  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
-  if (condition->waiting_sem_ == NULL ||
-      condition->received_sem_ == NULL ||
-      condition->signal_event_ == NULL) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-  return 0;
-}
-
-static int pthread_cond_signal(pthread_cond_t* const condition) {
-  int ok = 1;
-  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok = SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-  return !ok;
-}
-
-static int pthread_cond_wait(pthread_cond_t* const condition,
-                             pthread_mutex_t* const mutex) {
-  int ok;
-  // note that there is a consumer available so the signal isn't dropped in
-  // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
-    return 1;
-  // now unlock the mutex so pthread_cond_signal may be issued
-  pthread_mutex_unlock(mutex);
-  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
-        WAIT_OBJECT_0);
-  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
-  pthread_mutex_lock(mutex);
-  return !ok;
-}
-
-#else  // _WIN32
-# define THREADFN void*
-# define THREAD_RETURN(val) val
-#endif
-
 //------------------------------------------------------------------------------
 
 static THREADFN thread_loop(void *ptr) {    // thread loop
diff --git a/vp9/decoder/vp9_thread.h b/vp9/decoder/vp9_thread.h
index bc69cfa..2f8728d 100644
--- a/vp9/decoder/vp9_thread.h
+++ b/vp9/decoder/vp9_thread.h
@@ -26,7 +26,8 @@
 #if CONFIG_MULTITHREAD
 
 #if defined(_WIN32)
-
+#include <errno.h>  // NOLINT
+#include <process.h>  // NOLINT
 #include <windows.h>  // NOLINT
 typedef HANDLE pthread_t;
 typedef CRITICAL_SECTION pthread_mutex_t;
@@ -36,12 +37,120 @@
   HANDLE signal_event_;
 } pthread_cond_t;
 
-#else
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
 
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+                                 unsigned int (__stdcall *start)(void*),
+                                 void* arg) {  // 0 on success, 1 on failure
+  (void)attr;  // thread attributes are ignored by this emulation
+  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
+                                      0,      /* unsigned stack_size */
+                                      start,  /* thread entry point */
+                                      arg,    /* passed to start */
+                                      0,      /* unsigned initflag */
+                                      NULL);  /* unsigned *thrdaddr */
+  if (*thread == NULL) return 1;  // no thread handle: report failure
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);  // result unchecked
+  return 0;  // success
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;  // the thread's return value is not reported
+  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);  // nonzero if the wait or close failed
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void* mutexattr) {
+  (void)mutexattr;  // attributes are ignored
+  InitializeCriticalSection(mutex);  // pthread_mutex_t is a CRITICAL_SECTION
+  return 0;  // always reports success
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;  // EBUSY: not acquired
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);  // enter the critical section backing the mutex
+  return 0;  // always reports success
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);  // leave the critical section backing the mutex
+  return 0;  // always reports success
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);  // counterpart of pthread_mutex_init
+  return 0;  // always reports success
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;  // cleared if any CloseHandle call below fails
+  ok &= (CloseHandle(condition->waiting_sem_) != 0);
+  ok &= (CloseHandle(condition->received_sem_) != 0);
+  ok &= (CloseHandle(condition->signal_event_) != 0);
+  return !ok;  // 0 when all three handles were closed, 1 otherwise
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void* cond_attr) {
+  (void)cond_attr;  // attributes are ignored
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+  if (condition->waiting_sem_ == NULL ||
+      condition->received_sem_ == NULL ||
+      condition->signal_event_ == NULL) {  // any creation failed
+    pthread_cond_destroy(condition);  // CloseHandle is attempted on all three
+    return 1;  // failure
+  }
+  return 0;  // all three kernel objects were created
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;  // stays 1 when nobody is waiting; result is 0 on success
+  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok = SetEvent(condition->signal_event_);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait; WAIT_OBJECT_0 means success.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) ==
+           WAIT_OBJECT_0);
+  }
+  return !ok;  // 0 on success, 1 if signaling or the acknowledgement failed
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;  // nonzero while every step after registration has succeeded
+  // note that there is a consumer available so the signal isn't dropped in
+  // pthread_cond_signal
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
+    return 1;  // could not register as a waiter; mutex is still held
+  // now unlock the mutex so pthread_cond_signal may be issued
+  pthread_mutex_unlock(mutex);
+  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+        WAIT_OBJECT_0);
+  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);  // ack to signaler
+  pthread_mutex_lock(mutex);  // re-acquire the mutex before returning
+  return !ok;  // 0 on success, 1 on failure
+}
+#else  // _WIN32
 #include <pthread.h> // NOLINT
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
 
-#endif    /* _WIN32 */
-#endif    /* CONFIG_MULTITHREAD */
+#endif  // CONFIG_MULTITHREAD
 
 // State of the worker thread object
 typedef enum {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 716ad61..713cc51 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -239,25 +239,6 @@
   }
 }
 
-struct rdcost_block_args {
-  MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[16];
-  ENTROPY_CONTEXT t_left[16];
-  TX_SIZE tx_size;
-  int bw;
-  int bh;
-  int rate;
-  int64_t dist;
-  int64_t sse;
-  int this_rate;
-  int64_t this_dist;
-  int64_t this_sse;
-  int64_t this_rd;
-  int64_t best_rd;
-  int skip;
-  const int16_t *scan, *nb;
-};
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 724a115..35c6d6f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1031,131 +1031,171 @@
   }
   return 0;
 }
-
-// TODO(jingning) This currently serves as a test framework for non-RD mode
-// decision. To be continued on optimizing the partition type decisions.
-static void pick_partition_type(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MODE_INFO **mi_8x8, TOKENEXTRA **tp,
-                                int mi_row, int mi_col,
-                                BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                                int do_recon) {
+static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+                         BLOCK_SIZE bsize, int output_enabled) {  // both unused
+  int i;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
-  const int mi_stride = cm->mode_info_stride;
-  const int num_8x8_subsize = (num_8x8_blocks_wide_lookup[bsize] >> 1);
-  int i;
-  PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE subsize;
-  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
-  int sub_rate[4] = {0};
-  int64_t sub_dist[4] = {0};
-  int mi_offset;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
+  const int mb_mode_index = ctx->best_mode_index;
+  int max_plane;
 
-  partition = partition_lookup[b_width_log2(bsize)][bs_type];
-  subsize = get_subsize(bsize, partition);
+  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;  // non-inter: plane 0
+  for (i = 0; i < max_plane; ++i) {  // planes [0, max_plane): buffer index 1
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
+  for (i = max_plane; i < MAX_MB_PLANE; ++i) {  // the rest: buffer index 2
+    p[i].coeff = ctx->coeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+    p[i].eobs = ctx->eobs_pbuf[i][2];
+  }
+
+  x->skip = ctx->skip;  // carry the context's skip flag to the macroblock
+
+  if (frame_is_intra_only(cm)) {  // no statistics unless the #if is enabled
+#if CONFIG_INTERNAL_STATS
+    static const int kf_mode_index[] = {
+      THR_DC /*DC_PRED*/,
+      THR_V_PRED /*V_PRED*/,
+      THR_H_PRED /*H_PRED*/,
+      THR_D45_PRED /*D45_PRED*/,
+      THR_D135_PRED /*D135_PRED*/,
+      THR_D117_PRED /*D117_PRED*/,
+      THR_D153_PRED /*D153_PRED*/,
+      THR_D207_PRED /*D207_PRED*/,
+      THR_D63_PRED /*D63_PRED*/,
+      THR_TM /*TM_PRED*/,
+    };
+    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
+#endif
+  } else {
+    // Note how often each mode chosen as best
+    cpi->mode_chosen_counts[mb_mode_index]++;
+    if (is_inter_block(mbmi) &&
+        (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
+      int_mv best_mv[2];  // one entry per reference frame in use
+      for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
+        best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+      vp9_update_mv_count(cpi, x, best_mv);
+    }
+
+    if (cm->interp_filter == SWITCHABLE && is_inter_mode(mbmi->mode)) {
+      const int ctx = vp9_get_pred_context_switchable_interp(xd);  // shadows ctx
+      ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+    }
+  }
+}
+
+static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                     TOKENEXTRA **tp, int mi_row, int mi_col,
+                     int output_enabled, BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &cpi->mb;
 
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (x->ab_index != 0) {
-      *rate = 0;
-      *dist = 0;
+    if (x->ab_index > 0)
       return;
-    }
-  } else {
-    *(get_sb_partitioning(x, bsize)) = subsize;
   }
+  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  update_state_rt(cpi, get_block_context(x, bsize), bsize, output_enabled);
+
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
+  update_stats(cpi);  // called regardless of output_enabled
+
+  (*tp)->token = EOSB_TOKEN;  // append an EOSB_TOKEN entry after the block
+  (*tp)++;  // and advance the caller's token pointer past it
+}
+
+static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                      TOKENEXTRA **tp, int mi_row, int mi_col,
+                      int output_enabled, BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  int ctx;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // position lies outside cm's mi grid: nothing to encode
+
+  if (bsize >= BLOCK_8X8) {
+    MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+    const int idx_str = xd->mode_info_stride * mi_row + mi_col;
+    MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
+    ctx = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+                                 mi_row, mi_col, bsize);
+    subsize = mi_8x8[0]->mbmi.sb_type;  // block size recorded at this position
+
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
+  }
+
+  partition = partition_lookup[bsl][subsize];  // derived, not searched for
 
   switch (partition) {
     case PARTITION_NONE:
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist,
-                       bsize, get_block_context(x, bsize), INT64_MAX);
-      break;
-    case PARTITION_HORZ:
-      *get_sb_index(x, subsize) = 0;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
-                       subsize, get_block_context(x, subsize), INT64_MAX);
-      if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) {
-        update_state(cpi, get_block_context(x, subsize), subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *get_sb_index(x, subsize) = 1;
-        rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col,
-                         &sub_rate[1], &sub_dist[1], subsize,
-                         get_block_context(x, subsize), INT64_MAX);
-      }
-      *rate = sub_rate[0] + sub_rate[1];
-      *dist = sub_dist[0] + sub_dist[1];
+      if (output_enabled && bsize >= BLOCK_8X8)
+        cm->counts.partition[ctx][PARTITION_NONE]++;
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
       break;
     case PARTITION_VERT:
+      if (output_enabled)
+        cm->counts.partition[ctx][PARTITION_VERT]++;
       *get_sb_index(x, subsize) = 0;
-      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
-                       subsize, get_block_context(x, subsize), INT64_MAX);
-      if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) {
-        update_state(cpi, get_block_context(x, subsize), subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
+      if (mi_col + hbs < cm->mi_cols) {  // second half only if inside the grid
         *get_sb_index(x, subsize) = 1;
-        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize,
-                         &sub_rate[1], &sub_dist[1], subsize,
-                         get_block_context(x, subsize), INT64_MAX);
+        encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                    subsize);
       }
-      *rate = sub_rate[0] + sub_rate[1];
-      *dist = sub_dist[1] + sub_dist[1];
+      break;
+    case PARTITION_HORZ:
+      if (output_enabled)
+        cm->counts.partition[ctx][PARTITION_HORZ]++;
+      *get_sb_index(x, subsize) = 0;
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
+      if (mi_row + hbs < cm->mi_rows) {  // second half only if inside the grid
+        *get_sb_index(x, subsize) = 1;
+        encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                    subsize);
+      }
       break;
     case PARTITION_SPLIT:
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      if (output_enabled)
+        cm->counts.partition[ctx][PARTITION_SPLIT]++;
+
       *get_sb_index(x, subsize) = 0;
-      pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, subsize,
-                          &sub_rate[0], &sub_dist[0], 0);
-
-      if ((mi_col + num_8x8_subsize) < cm->mi_cols) {
-        *get_sb_index(x, subsize) = 1;
-        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize, tp,
-                            mi_row, mi_col + num_8x8_subsize, subsize,
-                            &sub_rate[1], &sub_dist[1], 0);
-      }
-
-      if ((mi_row + num_8x8_subsize) < cm->mi_rows) {
-        *get_sb_index(x, subsize) = 2;
-        pick_partition_type(cpi, tile, mi_8x8 + num_8x8_subsize * mi_stride, tp,
-                            mi_row + num_8x8_subsize, mi_col, subsize,
-                            &sub_rate[2], &sub_dist[2], 0);
-      }
-
-      if ((mi_col + num_8x8_subsize) < cm->mi_cols &&
-          (mi_row + num_8x8_subsize) < cm->mi_rows) {
-        *get_sb_index(x, subsize) = 3;
-        mi_offset = num_8x8_subsize * mi_stride + num_8x8_subsize;
-        pick_partition_type(cpi, tile, mi_8x8 + mi_offset, tp,
-                            mi_row + num_8x8_subsize, mi_col + num_8x8_subsize,
-                            subsize, &sub_rate[3], &sub_dist[3], 0);
-      }
-
-      for (i = 0; i < 4; ++i) {
-        *rate += sub_rate[i];
-        *dist += sub_dist[i];
-      }
-
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize);
+      *get_sb_index(x, subsize) = 1;
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                   subsize);
+      *get_sb_index(x, subsize) = 2;
+      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                   subsize);
+      *get_sb_index(x, subsize) = 3;
+      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+                subsize);
       break;
     default:
-      assert(0);
+      assert(0 && "Invalid partition type.");  // a bare literal never fails
   }
 
-  if (do_recon) {
-    int output_enabled = (bsize == BLOCK_64X64);
-
-    // Check the projected output rate for this SB against it's target
-    // and and if necessary apply a Q delta using segmentation to get
-    // closer to the target.
-    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
-      select_in_frame_q_segment(cpi, mi_row, mi_col,
-                                output_enabled, *rate);
-    }
-
-    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
-  }
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)  // else: done by kids
+    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+                             mi_row, mi_col, subsize, bsize);
 }
 
 static void rd_use_partition(VP9_COMP *cpi,
@@ -1446,15 +1486,19 @@
 }
 
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
-  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
-  BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
-  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
+  BLOCK_8X8,   BLOCK_8X8,   BLOCK_8X8,
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16
 };
 
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
-  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
-  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_32X32, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64
 };
 
 // Look at all the mode_info entries for blocks that are part of this
@@ -1540,9 +1584,11 @@
     }
   }
 
-  // Give a bit of leaway either side of the observed min and max
-  *min_block_size = min_partition_size[*min_block_size];
-  *max_block_size = max_partition_size[*max_block_size];
+  // adjust observed min and max
+  if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+    *min_block_size = min_partition_size[*min_block_size];
+    *max_block_size = max_partition_size[*max_block_size];
+  }
 
   // Check border cases where max and min from neighbours may not be legal.
   *max_block_size = find_partition_size(*max_block_size,
@@ -1998,34 +2044,6 @@
   restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 }
 
-static void encode_sb_row_rt(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, TOKENEXTRA **tp) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mi_col;
-
-  cpi->sf.always_this_block_size = BLOCK_8X8;
-
-  // Initialize the left context for the new SB row
-  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
-
-  // Code each SB in the row
-  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-       mi_col += MI_BLOCK_SIZE) {
-    int dummy_rate;
-    int64_t dummy_dist;
-    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
-    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
-
-    vp9_zero(cpi->mb.pred_mv);
-
-    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
-    pick_partition_type(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rate, &dummy_dist, 1);
-  }
-}
-
 static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                           int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2252,11 +2270,7 @@
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += 8)
-#if 1
             encode_sb_row(cpi, &tile, mi_row, &tp);
-#else
-            encode_sb_row_rt(cpi, &tile, mi_row, &tp);
-#endif
 
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2397,15 +2411,15 @@
   }
 }
 
-static int get_frame_type(VP9_COMP *cpi) {
+static MV_REFERENCE_FRAME get_frame_type(VP9_COMP *cpi) {
   if (frame_is_intra_only(&cpi->common))
-    return 0;
+    return INTRA_FRAME;  // intra-only frame
   else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
-    return 3;
+    return ALTREF_FRAME;  // golden refresh while the source is the alt-ref
   else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
-    return 1;
+    return LAST_FRAME;  // golden or alt-ref buffer is being refreshed
   else
-    return 2;
+    return GOLDEN_FRAME;  // none of the cases above
 }
 
 static void select_tx_mode(VP9_COMP *cpi) {
@@ -2435,6 +2449,264 @@
     }
   }
 }
+// Start RTC Exploration
+typedef enum {  // NOTE(review): no user of this enum is visible in this patch
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9  // values 7 and 8 are left unassigned
+} motion_vector_context;
+
+static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                          MB_PREDICTION_MODE mode, int mi_row, int mi_col) {
+  mbmi->interp_filter = EIGHTTAP;  // mi_row and mi_col are not used below
+  mbmi->mode = mode;
+  mbmi->mv[0].as_int = 0;  // both motion vectors are zeroed
+  mbmi->mv[1].as_int = 0;
+  if (mode < NEARESTMV) {  // modes ordered before NEARESTMV get INTRA_FRAME
+    mbmi->ref_frame[0] = INTRA_FRAME;
+  } else {
+    mbmi->ref_frame[0] = LAST_FRAME;
+  }
+
+  mbmi->ref_frame[1] = INTRA_FRAME;  // second reference is always INTRA_FRAME
+  mbmi->tx_size = max_txsize_lookup[bsize];
+  mbmi->uv_mode = mode;  // chroma uses the same mode value as luma
+  mbmi->skip_coeff = 0;
+  mbmi->sb_type = bsize;
+  mbmi->segment_id = 0;
+}
+static INLINE int get_block_row(int b32i, int b16i, int b8i) {
+  return ((b32i >> 1) << 2) + ((b16i >> 1) << 1) + (b8i >> 1);
+}  // sums (index >> 1) of each level, weighted 4, 2 and 1
+static INLINE int get_block_col(int b32i, int b16i, int b8i) {
+  return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1);
+}  // sums the low bit (index & 1) of each level, weighted 4, 2 and 1
+static void rtc_use_partition(VP9_COMP *cpi,
+                             const TileInfo *const tile,
+                             MODE_INFO **mi_8x8,
+                             TOKENEXTRA **tp, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, int *rate, int64_t *dist,
+                             int do_recon) {  // do_recon is not used below
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int mi_width = num_8x8_blocks_wide_lookup[cpi->sf.always_this_block_size];
+  int mi_height = num_8x8_blocks_high_lookup[cpi->sf.always_this_block_size];
+  int i, j;
+  int chosen_rate = INT_MAX;  // never updated below
+  int64_t chosen_dist = INT_MAX;  // NOTE(review): INT_MAX, not INT64_MAX
+  MB_PREDICTION_MODE mode = DC_PRED;  // mode written on key frames
+  int row8x8_remaining = tile->mi_row_end - mi_row;
+  int col8x8_remaining = tile->mi_col_end - mi_col;
+  int b32i;
+  x->fast_ms = 0;
+  x->subblock_ref = 0;
+  for (b32i = 0; b32i < 4; b32i++) {
+    int b16i;
+    for (b16i = 0; b16i < 4; b16i++) {  // 4 x 4 = 16 positions are visited
+      int b8i;
+      int block_row = get_block_row(b32i, b16i, 0);
+      int block_col = get_block_col(b32i, b16i, 0);
+      int index = block_row * mis + block_col;  // offset into mi_8x8
+      int rate;  // shadows the rate parameter inside this loop body
+      int64_t dist;  // shadows the dist parameter inside this loop body
+
+      int_mv frame_nearest_mv[MAX_REF_FRAMES];
+      int_mv frame_near_mv[MAX_REF_FRAMES];
+      struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
+
+      // Find a partition size that fits
+      bsize = find_partition_size(cpi->sf.always_this_block_size,
+                                  (row8x8_remaining - block_row),
+                                  (col8x8_remaining - block_col),
+                                  &mi_height, &mi_width);
+      mi_8x8[index] = mi_8x8[0] + index;
+
+      set_mi_row_col(xd, tile, mi_row + block_row, mi_height,
+                     mi_col + block_col, mi_width, cm->mi_rows, cm->mi_cols);
+
+      xd->mi_8x8 = mi_8x8 + index;
+
+      if (cm->frame_type != KEY_FRAME) {  // non-key frames: pick an inter mode
+        set_offsets(cpi, tile, mi_row + block_row, mi_col + block_col, bsize);
+
+        vp9_pick_inter_mode(cpi, x, tile,
+                            mi_row + block_row, mi_col + block_col,
+                            &rate, &dist, cpi->sf.always_this_block_size);
+      } else {  // key frames: write `mode` directly; rate/dist stay unset
+        set_mode_info(&mi_8x8[index]->mbmi, bsize, mode,
+                      mi_row + block_row, mi_col + block_col);
+        vp9_setup_buffer_inter(cpi, x, tile,
+                               LAST_FRAME, cpi->sf.always_this_block_size,
+                               mi_row + block_row, mi_col + block_col,
+                               frame_nearest_mv, frame_near_mv, yv12_mb);
+      }
+
+      for (j = 0; j < mi_height; j++)  // point covered entries at mi_8x8[index]
+        for (i = 0; i < mi_width; i++)
+          if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > i
+            && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > j) {
+            mi_8x8[index+ i + j * mis] = mi_8x8[index];
+          }
+
+      for (b8i = 0; b8i < 4; b8i++) {  // NOTE(review): empty loop body
+      }
+    }
+  }
+  encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);  // output on
+
+  *rate = chosen_rate;  // always INT_MAX: chosen_rate is never reassigned
+  *dist = chosen_dist;  // always INT_MAX: chosen_dist is never reassigned
+}
+
+static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                              int mi_row, TOKENEXTRA **tp) {
+  VP9_COMMON * const cm = &cpi->common;
+  int mi_col;
+
+  // Initialize the left context for the new SB row
+  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+
+  // Code each SB in the row
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    int dummy_rate;  // written by rtc_use_partition, never read here
+    int64_t dummy_dist;  // written by rtc_use_partition, never read here
+
+    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+
+    cpi->mb.source_variance = UINT_MAX;
+    set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+    set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+    rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                     &dummy_rate, &dummy_dist, 1);
+  }
+}
+
+
+static void encode_rtc_frame_internal(VP9_COMP *cpi) {
+  int mi_row;
+  MACROBLOCK * const x = &cpi->mb;
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCKD * const xd = &x->e_mbd;
+
+//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
+//           cpi->common.current_video_frame, cpi->common.show_frame,
+//           cm->frame_type);
+
+// debug output
+#if DBG_PRNT_SEGMAP
+  {
+    FILE *statsfile;
+    statsfile = fopen("segmap2.stt", "a");
+    fprintf(statsfile, "\n");
+    fclose(statsfile);
+  }
+#endif
+
+  vp9_zero(cm->counts.switchable_interp);  // reset per-frame counters
+  vp9_zero(cpi->tx_stepdown_count);
+
+  xd->mi_8x8 = cm->mi_grid_visible;
+  // required for vp9_frame_init_quantizer
+  xd->mi_8x8[0] = cm->mi;
+
+  xd->last_mi = cm->prev_mi;
+
+  vp9_zero(cpi->common.counts.mv);
+  vp9_zero(cpi->coef_counts);
+  vp9_zero(cm->counts.eob_branch);
+
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
+      && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
+  vp9_frame_init_quantizer(cpi);
+
+  vp9_initialize_rd_consts(cpi);
+  vp9_initialize_me_consts(cpi, cm->base_qindex);
+  switch_tx_mode(cpi);
+  cpi->sf.always_this_block_size = BLOCK_16X16;  // read by rtc_use_partition
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Initialize encode frame context.
+    init_encode_frame_mb_context(cpi);
+
+    // Build a frame level activity map
+    build_activity_map(cpi);
+  }
+
+  // Re-initialize encode frame context.
+  init_encode_frame_mb_context(cpi);
+
+  vp9_zero(cpi->rd_comp_pred_diff);
+  vp9_zero(cpi->rd_filter_diff);
+  vp9_zero(cpi->rd_tx_select_diff);
+  vp9_zero(cpi->rd_tx_select_threshes);
+
+  set_prev_mi(cm);
+
+  {
+    struct vpx_usec_timer emr_timer;  // times the tile loop below
+    vpx_usec_timer_start(&emr_timer);
+
+    {
+      // Take tiles into account and give start/end MB
+      int tile_col, tile_row;
+      TOKENEXTRA *tp = cpi->tok;  // running token write position
+      const int tile_cols = 1 << cm->log2_tile_cols;
+      const int tile_rows = 1 << cm->log2_tile_rows;
+
+      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+        for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+          TileInfo tile;
+          TOKENEXTRA *tp_old = tp;  // token position at the start of the tile
+
+          // For each row of SBs in the frame
+          vp9_tile_init(&tile, cm, tile_row, tile_col);
+          for (mi_row = tile.mi_row_start;
+               mi_row < tile.mi_row_end; mi_row += 8)
+            encode_rtc_sb_row(cpi, &tile, mi_row, &tp);
+
+          cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
+          assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+        }
+      }
+    }
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+  }
+
+  if (cpi->sf.skip_encode_sb) {
+    int j;
+    unsigned int intra_count = 0, inter_count = 0;  // summed over contexts
+    for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
+      intra_count += cm->counts.intra_inter[j][0];
+      inter_count += cm->counts.intra_inter[j][1];
+    }
+    cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count);
+    cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME);
+    cpi->sf.skip_encode_frame &= cm->show_frame;
+  } else {
+    cpi->sf.skip_encode_frame = 0;  // feature disabled by the speed settings
+  }
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+// end RTC play code
+
 
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2462,7 +2734,6 @@
   if (cpi->sf.RD) {
     int i;
     REFERENCE_MODE reference_mode;
-    INTERP_FILTER interp_filter;
     /*
      * This code does a single RD pass over the whole frame assuming
      * either compound, single or hybrid prediction as per whatever has
@@ -2472,7 +2743,7 @@
      * that for subsequent frames.
      * It does the same analysis for transform size selection also.
      */
-    const int frame_type = get_frame_type(cpi);
+    const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     const int64_t *mode_thresh = cpi->rd_prediction_type_threshes[frame_type];
     const int64_t *filter_thresh = cpi->rd_filter_threshes[frame_type];
 
@@ -2490,22 +2761,18 @@
     else
       reference_mode = REFERENCE_MODE_SELECT;
 
-    /* filter type selection */
-    // FIXME(rbultje) for some odd reason, we often select smooth_filter
-    // as default filter for ARF overlay frames. This is a REALLY BAD
-    // IDEA so we explicitly disable it here.
-    if (frame_type != 3 &&
-        filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP] &&
-        filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP_SHARP] &&
-        filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[SWITCHABLE - 1]) {
-      interp_filter = EIGHTTAP_SMOOTH;
-    } else if (filter_thresh[EIGHTTAP_SHARP] > filter_thresh[EIGHTTAP] &&
-               filter_thresh[EIGHTTAP_SHARP] > filter_thresh[SWITCHABLE - 1]) {
-      interp_filter = EIGHTTAP_SHARP;
-    } else if (filter_thresh[EIGHTTAP] > filter_thresh[SWITCHABLE - 1]) {
-      interp_filter = EIGHTTAP;
-    } else {
-      interp_filter = SWITCHABLE;
+    if (cm->interp_filter == SWITCHABLE) {
+      if (frame_type != ALTREF_FRAME &&
+          filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP] &&
+          filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[EIGHTTAP_SHARP] &&
+          filter_thresh[EIGHTTAP_SMOOTH] > filter_thresh[SWITCHABLE - 1]) {
+        cm->interp_filter = EIGHTTAP_SMOOTH;
+      } else if (filter_thresh[EIGHTTAP_SHARP] > filter_thresh[EIGHTTAP] &&
+          filter_thresh[EIGHTTAP_SHARP] > filter_thresh[SWITCHABLE - 1]) {
+        cm->interp_filter = EIGHTTAP_SHARP;
+      } else if (filter_thresh[EIGHTTAP] > filter_thresh[SWITCHABLE - 1]) {
+        cm->interp_filter = EIGHTTAP;
+      }
     }
 
     cpi->mb.e_mbd.lossless = cpi->oxcf.lossless;
@@ -2513,8 +2780,11 @@
     /* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */
     select_tx_mode(cpi);
     cm->reference_mode = reference_mode;
-    cm->interp_filter = interp_filter;
-    encode_frame_internal(cpi);
+
+    if (cpi->sf.super_fast_rtc)
+      encode_rtc_frame_internal(cpi);
+    else
+      encode_frame_internal(cpi);
 
     for (i = 0; i < REFERENCE_MODES; ++i) {
       const int diff = (int) (cpi->rd_comp_pred_diff[i] / cm->MBs);
@@ -2592,7 +2862,12 @@
       }
     }
   } else {
-    encode_frame_internal(cpi);
+    // Force the usage of the BILINEAR interp_filter.
+    cm->interp_filter = BILINEAR;
+    if (cpi->sf.super_fast_rtc)
+      encode_rtc_frame_internal(cpi);
+    else
+      encode_frame_internal(cpi);
   }
 }
 
@@ -2668,7 +2943,8 @@
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
-                   (cpi->oxcf.aq_mode != COMPLEXITY_AQ);
+                   (cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
+                   !cpi->sf.super_fast_rtc;
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4488189..8ff23c7 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -338,7 +338,6 @@
                            pd->above_context, pd->left_context,
                            num_4x4_w, num_4x4_h);
 }
-
 void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
                      TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index af9fa1b..a03cbdd 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -49,6 +49,9 @@
 
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
+#define MIN_BOOST        300
+#define KEY_FRAME_BOOST 2000
+
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
   *a = *b;
@@ -2219,8 +2222,8 @@
     if (kf_boost < (rc->frames_to_key * 3))
       kf_boost = (rc->frames_to_key * 3);
 
-    if (kf_boost < 300)  // Min KF boost
-      kf_boost = 300;
+    if (kf_boost < MIN_BOOST)
+      kf_boost = MIN_BOOST;
 
     // Make a note of baseline boost and the zero motion
     // accumulator value for use elsewhere.
@@ -2331,7 +2334,7 @@
     cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
                                     cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
-    cpi->rc.kf_boost = 2000;
+    cpi->rc.kf_boost = KEY_FRAME_BOOST;
     cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
@@ -2358,7 +2361,7 @@
     cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
                                     cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
-    cpi->rc.kf_boost = 2000;
+    cpi->rc.kf_boost = KEY_FRAME_BOOST;
     cpi->rc.source_alt_ref_active = 0;
   } else {
     cm->frame_type = INTER_FRAME;
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index c28d01c..b135b67 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -455,14 +455,17 @@
     cache_ptr += cm->mi_cols;
   }
 }
+static int is_slowest_mode(int mode) {
+  return (mode == MODE_SECONDPASS_BEST || mode == MODE_BESTQUALITY);
+}
 
-static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
+static void set_rd_speed_thresholds(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   int i;
 
   // Set baseline threshold values
   for (i = 0; i < MAX_MODES; ++i)
-    sf->thresh_mult[i] = mode == 0 ? -500 : 0;
+    sf->thresh_mult[i] = is_slowest_mode(cpi->oxcf.mode) ? -500 : 0;
 
   sf->thresh_mult[THR_NEARESTMV] = 0;
   sf->thresh_mult[THR_NEARESTG] = 0;
@@ -538,12 +541,12 @@
   }
 }
 
-static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
+static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   int i;
 
   for (i = 0; i < MAX_REFS; ++i)
-    sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0;
+    sf->thresh_mult_sub8x8[i] = is_slowest_mode(cpi->oxcf.mode)  ? -500 : 0;
 
   sf->thresh_mult_sub8x8[THR_LAST] += 2500;
   sf->thresh_mult_sub8x8[THR_GOLD] += 2500;
@@ -626,7 +629,7 @@
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -663,7 +666,7 @@
     sf->disable_filter_search_var_thresh = 100;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -698,7 +701,7 @@
     sf->disable_filter_search_var_thresh = 200;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -797,7 +800,7 @@
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
-    sf->auto_min_max_partition_size = 1;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
@@ -839,19 +842,24 @@
   if (speed >= 5) {
     int i;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->auto_min_max_partition_size = frame_is_intra_only(cm) ?
+        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
     sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
       sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
     }
     sf->use_fast_lpf_pick = 2;
+    sf->RD = 0;
+  }
+  if (speed >= 6) {
+    sf->super_fast_rtc = 1;
   }
 }
 
 void vp9_set_speed_features(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   VP9_COMMON *cm = &cpi->common;
-  int mode = cpi->compressor_speed;
   int speed = cpi->speed;
   int i;
 
@@ -884,7 +892,7 @@
   sf->use_one_partition_size_always = 0;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
-  sf->auto_min_max_partition_size = 0;
+  sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->max_partition_size = BLOCK_64X64;
   sf->min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
@@ -904,23 +912,26 @@
   sf->use_fast_coef_updates = 0;
   sf->using_small_partition_info = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
+  sf->super_fast_rtc = 0;
 
-  switch (mode) {
-    case 0:  // This is the best quality mode.
+  switch (cpi->oxcf.mode) {
+    case MODE_BESTQUALITY:
+    case MODE_SECONDPASS_BEST:  // This is the best quality mode.
       cpi->diamond_search_sad = vp9_full_range_search;
       break;
-    case 1:
+    case MODE_FIRSTPASS:
+    case MODE_GOODQUALITY:
+    case MODE_SECONDPASS:
       set_good_speed_feature(cm, sf, speed);
       break;
-      break;
-    case 2:
+    case MODE_REALTIME:
       set_rt_speed_feature(cm, sf, speed);
       break;
   }; /* switch */
 
   // Set rd thresholds based on mode and speed setting
-  set_rd_speed_thresholds(cpi, mode);
-  set_rd_speed_thresholds_sub8x8(cpi, mode);
+  set_rd_speed_thresholds(cpi);
+  set_rd_speed_thresholds_sub8x8(cpi);
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -1241,24 +1252,24 @@
       // Real time and one pass deprecated in test code base
     case MODE_GOODQUALITY:
       cpi->pass = 0;
-      cpi->compressor_speed = 2;
       cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
       break;
 
     case MODE_FIRSTPASS:
       cpi->pass = 1;
-      cpi->compressor_speed = 1;
       break;
 
     case MODE_SECONDPASS:
       cpi->pass = 2;
-      cpi->compressor_speed = 1;
       cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
       break;
 
     case MODE_SECONDPASS_BEST:
       cpi->pass = 2;
-      cpi->compressor_speed = 0;
+      break;
+
+    case MODE_REALTIME:
+      cpi->pass = 0;
       break;
   }
 
@@ -2732,7 +2743,9 @@
     if (cpi->sf.recode_loop != 0) {
       vp9_save_coding_context(cpi);
       cpi->dummy_packing = 1;
-      vp9_pack_bitstream(cpi, dest, size);
+      if (!cpi->sf.super_fast_rtc)
+        vp9_pack_bitstream(cpi, dest, size);
+
       cpi->rc.projected_frame_size = (*size) << 3;
       vp9_restore_coding_context(cpi);
 
@@ -3081,11 +3094,22 @@
                                    &frame_under_shoot_limit,
                                    &frame_over_shoot_limit);
 
-  // Decide q and q bounds
+  // Decide q and q bounds.
   q = vp9_rc_pick_q_and_adjust_q_bounds(cpi,
                                         &bottom_index,
                                         &top_index);
 
+  // JBB : This is realtime mode.  In real time mode the first frame
+  // should be larger. Q of 0 is disabled because we force tx size to be
+  // 16x16...
+  if (cpi->sf.super_fast_rtc) {
+    if (cpi->common.current_video_frame == 0)
+      q /= 3;
+
+    if (q == 0)
+      q++;
+  }
+
   if (!frame_is_intra_only(cm)) {
     cm->interp_filter = DEFAULT_INTERP_FILTER;
     /* TODO: Decide this more intelligently */
@@ -3227,7 +3251,8 @@
   cm->last_height = cm->height;
 
   // reset to normal state now that we are done.
-  cm->last_show_frame = cm->show_frame;
+  if (!cm->show_existing_frame)
+    cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
     // current mip will be the prev_mip for the next frame
     MODE_INFO *temp = cm->prev_mip;
@@ -3307,6 +3332,7 @@
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
   VP9_COMP              *cpi = (VP9_COMP *) ptr;
+  VP9_COMMON             *cm = &cpi->common;
   struct vpx_usec_timer  timer;
   int                    res = 0;
   const int    subsampling_x = sd->uv_width  < sd->y_width;
@@ -3320,6 +3346,12 @@
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
+  if (cm->version == 0 && (subsampling_x != 1 || subsampling_y != 1)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                       "Non-4:2:0 color space requires profile >= 1");
+    res = -1;
+  }
+
   return res;
 }
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index a4cd9bb..c3ecd7f 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -181,6 +181,12 @@
 } TX_SIZE_SEARCH_METHOD;
 
 typedef enum {
+  NOT_IN_USE = 0,
+  RELAXED_NEIGHBORING_MIN_MAX = 1,
+  STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
   // Values should be powers of 2 so that they can be selected as bits of
   // an integer flags field
 
@@ -337,9 +343,8 @@
   BLOCK_SIZE always_this_block_size;
 
   // Sets min and max partition sizes for this 64x64 region based on the
-  // same superblock in last encoded frame, and the left and above neighbor
-  // in this block.
-  int auto_min_max_partition_size;
+  // same 64x64 in last encoded frame, and the left and above neighbor.
+  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
 
   // Min and max partition size we enable (block_size) as per auto
   // min max, but also used by adjust partitioning, and pick_partitioning.
@@ -411,6 +416,9 @@
   // This feature limits the number of coefficients updates we actually do
   // by only looking at counts from 1/2 the bands.
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
+
+  // This flag controls the use of the new super fast rtc mode
+  int super_fast_rtc;
 } SPEED_FEATURES;
 
 typedef struct VP9_COMP {
@@ -434,7 +442,6 @@
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9_CONFIG oxcf;
-  struct rdcost_block_args rdcost_stack;
   struct lookahead_ctx    *lookahead;
   struct lookahead_entry  *source;
 #if CONFIG_MULTIPLE_ARF
@@ -546,7 +553,6 @@
 
   // for real time encoding
   int speed;
-  int compressor_speed;
 
   int cpu_used;
   int pass;
@@ -751,8 +757,10 @@
 
 static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
                          MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) {
-  xd->block_refs[0] = &cm->frame_refs[ref0 - LAST_FRAME];
-  xd->block_refs[1] = &cm->frame_refs[ref1 - LAST_FRAME];
+  xd->block_refs[0] = &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME
+                                                         : 0];
+  xd->block_refs[1] = &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME
+                                                         : 0];
 }
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 210d15f..bd28ea5 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -80,7 +80,7 @@
   step_param = 6;
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-  for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
+  for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
     if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
       tmp_mv->as_int = INVALID_MV;
 
@@ -124,8 +124,8 @@
                                    stride, 0x7fffffff);
 
   // scale to 1/8 pixel resolution
-  tmp_mv->as_mv.row = tmp_mv->as_mv.row << 3;
-  tmp_mv->as_mv.col = tmp_mv->as_mv.col << 3;
+  tmp_mv->as_mv.row = tmp_mv->as_mv.row * 8;
+  tmp_mv->as_mv.col = tmp_mv->as_mv.col * 8;
 
   // calculate the bit cost on motion vector
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
@@ -142,8 +142,7 @@
                             int mi_row, int mi_col,
                             int *returnrate,
                             int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx) {
+                            BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
@@ -155,6 +154,7 @@
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   int64_t this_rd;
+  int64_t cost[4]= { 0, 100, 150,  205 };
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -171,7 +171,7 @@
   mbmi->tx_size = MIN(max_txsize_lookup[bsize],
                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       vp9_setup_buffer_inter(cpi, x, tile,
@@ -182,7 +182,7 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     int rate_mv = 0;
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
@@ -191,29 +191,42 @@
     // Select prediction reference frames.
     xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
 
-
-    x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
-        full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
-                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
-
-    if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
-      continue;
-
     clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
     clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
 
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-      int rate = x->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                   [INTER_OFFSET(this_mode)];
-      int64_t dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)] *
-                      x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      int rate = cost[this_mode - NEARESTMV];
+      int64_t dist;
+
+      if (this_mode == NEWMV) {
+        if (this_rd < 300)
+          continue;
+
+        x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
+            full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                                     &frame_mv[NEWMV][ref_frame], &rate_mv);
+
+        if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+          continue;
+      }
+
+      dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
+      this_rd = rate + dist;
 
       if (this_rd < best_rd) {
         best_rd = this_rd;
         mbmi->mode = this_mode;
         mbmi->ref_frame[0] = ref_frame;
         mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+        xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+        mbmi->interp_filter = EIGHTTAP;
+
+        mbmi->ref_frame[1] = INTRA_FRAME;
+        mbmi->tx_size = max_txsize_lookup[bsize];
+        mbmi->uv_mode = this_mode;
+        mbmi->skip_coeff = 0;
+        mbmi->sb_type = bsize;
+        mbmi->segment_id = 0;
       }
     }
   }
@@ -223,8 +236,5 @@
   // TODO(jingning) intra prediction search, if the best SAD is above a certain
   // threshold.
 
-  // store mode decisions
-  ctx->mic = *xd->mi_8x8[0];
-
   return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 82904ae..05ff187 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -22,8 +22,7 @@
                             int mi_row, int mi_col,
                             int *returnrate,
                             int64_t *returndistortion,
-                            BLOCK_SIZE bsize,
-                            PICK_MODE_CONTEXT *ctx);
+                            BLOCK_SIZE bsize);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 24b41a9..9124880 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -55,6 +55,22 @@
   MV_REFERENCE_FRAME ref_frame[2];
 } REF_DEFINITION;
 
+struct rdcost_block_args {
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  int this_rate;
+  int64_t this_dist;
+  int64_t this_sse;
+  int64_t this_rd;
+  int64_t best_rd;
+  int skip;
+  const int16_t *scan, *nb;
+};
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {NEARESTMV, {LAST_FRAME,   NONE}},
   {NEARESTMV, {ALTREF_FRAME, NONE}},
@@ -280,22 +296,24 @@
 
   fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
-  for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
-                    vp9_partition_tree);
+  if (!cpi->sf.super_fast_rtc) {
+    for (i = 0; i < PARTITION_CONTEXTS; i++)
+      vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
+                      vp9_partition_tree);
 
-  fill_mode_costs(cpi);
+    fill_mode_costs(cpi);
 
-  if (!frame_is_intra_only(cm)) {
-    vp9_build_nmv_cost_table(x->nmvjointcost,
-                             cm->allow_high_precision_mv ? x->nmvcost_hp
-                                                         : x->nmvcost,
-                             &cm->fc.nmvc,
-                             cm->allow_high_precision_mv, 1, 1);
+    if (!frame_is_intra_only(cm)) {
+      vp9_build_nmv_cost_table(x->nmvjointcost,
+                               cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                           : x->nmvcost,
+                               &cm->fc.nmvc,
+                               cm->allow_high_precision_mv, 1, 1);
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      vp9_cost_tokens((int *)x->inter_mode_cost[i],
-                      cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+        vp9_cost_tokens((int *)x->inter_mode_cost[i],
+                        cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+    }
   }
 }
 
@@ -425,7 +443,7 @@
 
     if (i == 0)
       x->pred_sse[ref] = sse;
-    if (cpi->compressor_speed > 2) {
+    if (cpi->sf.super_fast_rtc) {
       dist_sum += (int)sse;
     } else {
       int rate;
@@ -585,15 +603,15 @@
   return cost;
 }
 
-static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
+static void dist_block(int plane, int block, TX_SIZE tx_size,
+                       struct rdcost_block_args* args) {
   const int ss_txfrm_size = tx_size << 1;
-  struct rdcost_block_args* args = arg;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int64_t this_sse;
-  int shift = args->tx_size == TX_32X32 ? 0 : 2;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
@@ -610,14 +628,12 @@
 }
 
 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                       TX_SIZE tx_size, void *arg) {
-  struct rdcost_block_args* args = arg;
-
+                       TX_SIZE tx_size, struct rdcost_block_args* args) {
   int x_idx, y_idx;
-  txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 
   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
-                           args->t_left + y_idx, args->tx_size,
+                           args->t_left + y_idx, tx_size,
                            args->scan, args->nb);
 }
 
@@ -694,24 +710,19 @@
   }
 }
 
-static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,
-                              const int num_4x4_w, const int num_4x4_h,
-                              const int64_t ref_rdcost,
+static void init_rdcost_stack(MACROBLOCK *x, const int64_t ref_rdcost,
                               struct rdcost_block_args *arg) {
   vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
   arg->x = x;
-  arg->tx_size = tx_size;
-  arg->bw = num_4x4_w;
-  arg->bh = num_4x4_h;
   arg->best_rd = ref_rdcost;
 }
 
 static void txfm_rd_in_plane(MACROBLOCK *x,
-                             struct rdcost_block_args *rd_stack,
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
                              BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  struct rdcost_block_args rd_stack;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
@@ -719,30 +730,29 @@
   const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
   const scan_order *so;
 
-  init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
-                    ref_best_rd, rd_stack);
+  init_rdcost_stack(x, ref_best_rd, &rd_stack);
   if (plane == 0)
     xd->mi_8x8[0]->mbmi.tx_size = tx_size;
 
-  vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
+  vp9_get_entropy_contexts(tx_size, rd_stack.t_above, rd_stack.t_left,
                            pd->above_context, pd->left_context,
                            num_4x4_w, num_4x4_h);
 
   so = get_scan(xd, tx_size, pd->plane_type, 0);
-  rd_stack->scan = so->scan;
-  rd_stack->nb = so->neighbors;
+  rd_stack.scan = so->scan;
+  rd_stack.nb = so->neighbors;
 
   foreach_transformed_block_in_plane(xd, bsize, plane,
-                                     block_rd_txfm, rd_stack);
-  if (rd_stack->skip) {
+                                     block_rd_txfm, &rd_stack);
+  if (rd_stack.skip) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
     *sse        = INT64_MAX;
     *skippable  = 0;
   } else {
-    *distortion = rd_stack->this_dist;
-    *rate       = rd_stack->this_rate;
-    *sse        = rd_stack->this_sse;
+    *distortion = rd_stack.this_dist;
+    *rate       = rd_stack.this_rate;
+    *sse        = rd_stack.this_sse;
     *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
   }
 }
@@ -760,7 +770,7 @@
 
   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 
-  txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+  txfm_rd_in_plane(x, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
                    mbmi->tx_size);
   cpi->tx_stepdown_count[0]++;
@@ -891,7 +901,7 @@
 
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
-  txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+  txfm_rd_in_plane(x, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
 
   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
@@ -914,7 +924,6 @@
   int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
   const int b_inter_mode = is_inter_block(mbmi);
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   TX_SIZE tx_size;
@@ -944,7 +953,7 @@
                                   skip, sse, ref_best_rd, bs);
   } else {
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-      txfm_rd_in_plane(x, rdcost_stack, &r[tx_size][0], &d[tx_size],
+      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
                        &s[tx_size], &sse[tx_size],
                        ref_best_rd, 0, bs, tx_size);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
@@ -1273,7 +1282,7 @@
   *skippable = 1;
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
+    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
                      ref_best_rd, plane, bsize, uv_txfm_size);
     if (pnrate == INT_MAX)
       goto term;
@@ -1759,7 +1768,8 @@
           if (best_rd < label_mv_thresh)
             break;
 
-          if (cpi->compressor_speed) {
+          if (cpi->oxcf.mode != MODE_SECONDPASS_BEST &&
+              cpi->oxcf.mode != MODE_BESTQUALITY) {
             // use previous block's result as next block's MV predictor.
             if (i > 0) {
               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1823,7 +1833,8 @@
           }
 
           // Should we do a full search (best quality only)
-          if (cpi->compressor_speed == 0) {
+          if (cpi->oxcf.mode == MODE_BESTQUALITY ||
+              cpi->oxcf.mode == MODE_SECONDPASS_BEST) {
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 897ecd7..6b18171 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -563,10 +563,21 @@
   unsigned int new_qc;
 
   /* Use best quality mode if no deadline is given. */
-  if (deadline)
-    new_qc = MODE_GOODQUALITY;
-  else
-    new_qc = MODE_BESTQUALITY;
+  new_qc = MODE_BESTQUALITY;
+
+  if (deadline) {
+      uint64_t     duration_us;
+
+      /* Convert duration parameter from stream timebase to microseconds */
+      duration_us = (uint64_t)duration * 1000000
+                    * (uint64_t)ctx->cfg.g_timebase.num
+                    / (uint64_t)ctx->cfg.g_timebase.den;
+
+      /* If the deadline is more than the duration this frame is to be shown,
+       * use good quality mode. Otherwise use realtime mode.
+       */
+      new_qc = (deadline > duration_us) ? MODE_GOODQUALITY : MODE_REALTIME;
+  }
 
   if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
     new_qc = MODE_FIRSTPASS;
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index b722200..de210f4 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -21,6 +21,8 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
+VP9_DX_SRCS-yes += decoder/vp9_dthread.c
+VP9_DX_SRCS-yes += decoder/vp9_dthread.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.h
 VP9_DX_SRCS-yes += decoder/vp9_reader.c
 VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
diff --git a/vp9_spatial_scalable_encoder.c b/vp9_spatial_scalable_encoder.c
index e71094a..50f45c2 100644
--- a/vp9_spatial_scalable_encoder.c
+++ b/vp9_spatial_scalable_encoder.c
@@ -193,8 +193,6 @@
   vpx_codec_err_t res;
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
-  vpx_codec_cx_pkt_t packet = {0};
-  packet.kind = VPX_CODEC_CX_FRAME_PKT;
 
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
@@ -234,9 +232,7 @@
       die_codec(&codec, "Failed to encode frame");
     }
     if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
-      packet.data.frame.pts = pts;
-      packet.data.frame.sz = vpx_svc_get_frame_size(&svc_ctx);
-      ivf_write_frame_header(outfile, &packet);
+      ivf_write_frame_header(outfile, pts, vpx_svc_get_frame_size(&svc_ctx));
       (void)fwrite(vpx_svc_get_buffer(&svc_ctx), 1,
                    vpx_svc_get_frame_size(&svc_ctx), outfile);
     }
diff --git a/vpxenc.c b/vpxenc.c
index f1feb47..f772432 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1323,7 +1323,7 @@
             ivf_header_pos = ftello(stream->file);
             fsize = pkt->data.frame.sz;
 
-            ivf_write_frame_header(stream->file, pkt);
+            ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
           } else {
             fsize += pkt->data.frame.sz;