Merge "vp9_decoder_remove: destroy common after thread shutdown"
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index f9c09c6..8940027 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -653,6 +653,8 @@
 e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
 9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
 8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
+698a6910a97486b833073ef0c0b18d75dce57ee8  vp90-2-16-intra-only.webm
+5661b0168752969f055eec37b05fa9fa947dc7eb  vp90-2-16-intra-only.webm.md5
 0321d507ce62dedc8a51b4e9011f7a19aed9c3dc  vp91-2-04-yuv444.webm
 367e423dd41fdb49aa028574a2cfec5c2f325c5c  vp91-2-04-yuv444.webm.md5
 76024eb753cdac6a5e5703aaea189d35c3c30ac7  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
diff --git a/test/test.mk b/test/test.mk
index 85212d9..ef81ab1 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -771,6 +771,8 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
 
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 41c9e26..4955887 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -180,6 +180,7 @@
   "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
   "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
+  "vp90-2-16-intra-only.webm",
   "vp91-2-04-yuv444.webm",
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 72719a6..fa51835 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -18,7 +18,7 @@
 #if CONFIG_WEBM_IO
 #include "test/webm_video_source.h"
 #endif
-#include "vp9/decoder/vp9_thread.h"
+#include "vp9/common/vp9_thread.h"
 
 namespace {
 
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index e1753a1..afe831a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -257,10 +257,14 @@
   xd->mi_stride = cm->mi_stride;
 }
 
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+  return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
 static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
                                                   int ctx) {
-  return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
-                                     : cm->fc.partition_prob[ctx];
+  return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
+                                 : cm->fc.partition_prob[ctx];
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -299,10 +303,6 @@
                   cm->prev_mip + cm->mi_stride + 1 : NULL;
 }
 
-static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
-  return cm->frame_type == KEY_FRAME || cm->intra_only;
-}
-
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             int mi_row, int mi_col,
                                             BLOCK_SIZE subsize,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f52dccb..b182f3f 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -717,6 +717,9 @@
 add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
 
+add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
 
diff --git a/vp9/decoder/vp9_thread.c b/vp9/common/vp9_thread.c
similarity index 100%
rename from vp9/decoder/vp9_thread.c
rename to vp9/common/vp9_thread.c
diff --git a/vp9/decoder/vp9_thread.h b/vp9/common/vp9_thread.h
similarity index 100%
rename from vp9/decoder/vp9_thread.h
rename to vp9/common/vp9_thread.h
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 8b96abb..55d5b4f 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -28,6 +28,7 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_thread.h"
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
@@ -38,7 +39,6 @@
 #include "vp9/decoder/vp9_dthread.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 #include "vp9/decoder/vp9_reader.h"
-#include "vp9/decoder/vp9_thread.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
@@ -605,8 +605,8 @@
                              : literal_to_filter[vp9_rb_read_literal(rb, 2)];
 }
 
-static void read_frame_size(struct vp9_read_bit_buffer *rb,
-                            int *width, int *height) {
+void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+                         int *width, int *height) {
   const int w = vp9_rb_read_literal(rb, 16) + 1;
   const int h = vp9_rb_read_literal(rb, 16) + 1;
   *width = w;
@@ -617,7 +617,7 @@
   cm->display_width = cm->width;
   cm->display_height = cm->height;
   if (vp9_rb_read_bit(rb))
-    read_frame_size(rb, &cm->display_width, &cm->display_height);
+    vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
 static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
@@ -649,7 +649,7 @@
 
 static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   int width, height;
-  read_frame_size(rb, &width, &height);
+  vp9_read_frame_size(rb, &width, &height);
   apply_frame_size(cm, width, height);
   setup_display_size(cm, rb);
 }
@@ -669,7 +669,7 @@
   }
 
   if (!found)
-    read_frame_size(rb, &width, &height);
+    vp9_read_frame_size(rb, &width, &height);
 
   // Check that each of the frames that this frame references has valid
   // dimensions.
@@ -1053,20 +1053,17 @@
   return bit_reader_end;
 }
 
-static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
-  if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
-      vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
-      vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) {
-    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                       "Invalid frame sync code");
-  }
-}
-
 static void error_handler(void *data) {
   VP9_COMMON *const cm = (VP9_COMMON *)data;
   vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
+int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
+  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+}
+
 static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) {
   int profile = vp9_rb_read_bit(rb);
   profile |= vp9_rb_read_bit(rb) << 1;
@@ -1112,7 +1109,9 @@
   cm->error_resilient_mode = vp9_rb_read_bit(rb);
 
   if (cm->frame_type == KEY_FRAME) {
-    check_sync_code(cm, rb);
+    if (!vp9_read_sync_code(rb))
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame sync code");
     if (cm->profile > PROFILE_1)
       cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10;
     cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
@@ -1150,9 +1149,18 @@
         0 : vp9_rb_read_literal(rb, 2);
 
     if (cm->intra_only) {
-      check_sync_code(cm, rb);
+      if (!vp9_read_sync_code(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid frame sync code");
 
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+
+      // NOTE: The intra-only frame header does not include the specification of
+      // either the color space or color sub-sampling. VP9 specifies that the
+      // default should be BT.601 with YUV 4:2:0 sub-sampling here (normative).
+      cm->color_space = BT_601;
+      cm->subsampling_y = cm->subsampling_x = 1;
+
       setup_frame_size(cm, rb);
     } else {
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h
index fb15645..e5d9d62 100644
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h
@@ -18,6 +18,7 @@
 
 struct VP9Common;
 struct VP9Decoder;
+struct vp9_read_bit_buffer;
 
 void vp9_init_dequantizer(struct VP9Common *cm);
 
@@ -25,6 +26,10 @@
                       const uint8_t *data, const uint8_t *data_end,
                       const uint8_t **p_data_end);
 
+int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
+void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+                         int *width, int *height);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index a0bd2f1..a1a78a9 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -267,7 +267,10 @@
 
   vp9_decode_frame(pbi, source, source + size, psource);
 
-  swap_frame_buffers(pbi);
+  if (!cm->show_existing_frame)
+    swap_frame_buffers(pbi);
+  else
+    cm->frame_to_show = get_frame_new_buffer(cm);
 
   vp9_clear_system_state();
 
@@ -291,6 +294,7 @@
 
 int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
                       vp9_ppflags_t *flags) {
+  VP9_COMMON *const cm = &pbi->common;
   int ret = -1;
 #if !CONFIG_VP9_POSTPROC
   (void)*flags;
@@ -300,15 +304,20 @@
     return ret;
 
   /* no raw frame to show!!! */
-  if (pbi->common.show_frame == 0)
+  if (!cm->show_frame)
     return ret;
 
   pbi->ready_for_new_data = 1;
 
 #if CONFIG_VP9_POSTPROC
-  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
+  if (!cm->show_existing_frame) {
+    ret = vp9_post_proc_frame(cm, sd, flags);
+  } else {
+    *sd = *cm->frame_to_show;
+    ret = 0;
+  }
 #else
-  *sd = *pbi->common.frame_to_show;
+  *sd = *cm->frame_to_show;
   ret = 0;
 #endif /*!CONFIG_POSTPROC*/
   vp9_clear_system_state();
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index ab4f9a2..8e16e1c 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -18,10 +18,9 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
+#include "vp9/common/vp9_thread.h"
 
-#include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_thread.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index a727e2a..423bd88 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -12,8 +12,8 @@
 #define VP9_DECODER_VP9_DTHREAD_H_
 
 #include "./vpx_config.h"
+#include "vp9/common/vp9_thread.h"
 #include "vp9/decoder/vp9_reader.h"
-#include "vp9/decoder/vp9_thread.h"
 
 struct VP9Common;
 struct VP9Decoder;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index f6393e0..dbf8cd7 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -190,12 +190,19 @@
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
 
+  MB_MODE_INFO saved_mbmi;
+  int i, j;
+  struct buf_2d saved_dst[MAX_MB_PLANE];
+  struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
+
   // We will restore these after motion compensation.
-  MB_MODE_INFO saved_mbmi = *mbmi;
-  struct buf_2d saved_dst = filter_mbd->plane[0].dst;
-  struct buf_2d saved_pre[2];
-  saved_pre[0] = filter_mbd->plane[0].pre[0];
-  saved_pre[1] = filter_mbd->plane[0].pre[1];
+  saved_mbmi = *mbmi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+    }
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
 
   mv_col = denoiser->best_sse_mv.as_mv.col;
   mv_row = denoiser->best_sse_mv.as_mv.row;
@@ -224,67 +231,52 @@
 
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
-  filter_mbd->plane[0].pre[0].buf =
-      block_start(denoiser->running_avg_y[frame].y_buffer,
-                  denoiser->running_avg_y[frame].y_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[0].pre[0].stride = denoiser->running_avg_y[frame].y_stride;
-
-  filter_mbd->plane[1].pre[0].buf =
-      block_start(denoiser->running_avg_y[frame].u_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[1].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
-
-  filter_mbd->plane[2].pre[0].buf =
-      block_start(denoiser->running_avg_y[frame].v_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
-
-  filter_mbd->plane[0].pre[1].buf =
-      block_start(denoiser->running_avg_y[frame].y_buffer,
-                  denoiser->running_avg_y[frame].y_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[0].pre[1].stride = denoiser->running_avg_y[frame].y_stride;
-
-  filter_mbd->plane[1].pre[1].buf =
-      block_start(denoiser->running_avg_y[frame].u_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[1].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
-
-  filter_mbd->plane[2].pre[1].buf =
-      block_start(denoiser->running_avg_y[frame].v_buffer,
-                  denoiser->running_avg_y[frame].uv_stride,
-                  mi_row, mi_col);
-  filter_mbd->plane[2].pre[1].stride = denoiser->running_avg_y[frame].uv_stride;
-
+  for (j = 0; j < 2; ++j) {
+    filter_mbd->plane[0].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].y_buffer,
+                    denoiser->running_avg_y[frame].y_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[0].pre[j].stride =
+        denoiser->running_avg_y[frame].y_stride;
+    filter_mbd->plane[1].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].u_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[1].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+    filter_mbd->plane[2].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].v_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[2].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+  }
   filter_mbd->plane[0].dst.buf =
       block_start(denoiser->mc_running_avg_y.y_buffer,
                   denoiser->mc_running_avg_y.y_stride,
                   mi_row, mi_col);
   filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
-
   filter_mbd->plane[1].dst.buf =
       block_start(denoiser->mc_running_avg_y.u_buffer,
                   denoiser->mc_running_avg_y.uv_stride,
                   mi_row, mi_col);
-  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.y_stride;
-
+  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
   filter_mbd->plane[2].dst.buf =
       block_start(denoiser->mc_running_avg_y.v_buffer,
                   denoiser->mc_running_avg_y.uv_stride,
                   mi_row, mi_col);
-  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.y_stride;
+  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
 
   vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
 
   // Restore everything to its original state
-  filter_mbd->plane[0].pre[0] = saved_pre[0];
-  filter_mbd->plane[0].pre[1] = saved_pre[1];
-  filter_mbd->plane[0].dst = saved_dst;
   *mbmi = saved_mbmi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      filter_mbd->plane[i].pre[j] = saved_pre[i][j];
+    }
+    filter_mbd->plane[i].dst = saved_dst[i];
+  }
 
   mv_row = denoiser->best_sse_mv.as_mv.row;
   mv_col = denoiser->best_sse_mv.as_mv.col;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index eb9624d..cd0191e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -320,10 +320,10 @@
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
-                           scan_order->iscan);
+      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                            scan_order->iscan);
       break;
     case TX_16X16:
       vp9_fdct16x16(src_diff, coeff, diff_stride);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index e0e0561..a240622 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -862,9 +862,7 @@
 
 #if CONFIG_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    yuv_denoised_file = fopen("denoised.yuv", "ab");
-  }
+  yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
 #endif
 #ifdef OUTPUT_YUV_SRC
@@ -1122,9 +1120,7 @@
 
 #if CONFIG_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    fclose(yuv_denoised_file);
-  }
+  fclose(yuv_denoised_file);
 #endif
 #endif
 #ifdef OUTPUT_YUV_SRC
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c66e003..9f8b37f 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -338,7 +338,6 @@
   CYCLIC_REFRESH *cyclic_refresh;
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
   vp9_full_search_fn_t full_search_sad;
   vp9_refining_search_fn_t refining_search_sad;
   vp9_diamond_search_fn_t diamond_search_sad;
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 9eb2fbc..6e04e2a 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -56,7 +56,7 @@
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
-        &sse);
+        &sse, NULL, 0, 0);
   }
 
   xd->mi[0]->mbmi.mode = NEWMV;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index c0edf45..01d2b44 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -172,15 +172,15 @@
   return &buf[(r >> 3) * stride + (c >> 3)];
 }
 
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-    vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-             src_stride, &sse)
-
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
-    thismse = (DIST(r, c));                                            \
+    if (second_pred == NULL)                                           \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                             src_stride, &sse);                        \
+    else                                                               \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+                              z, src_stride, &sse, second_pred);       \
     if ((v = MVC(r, c) + thismse) < besterr) {                         \
       besterr = v;                                                     \
       br = r;                                                          \
@@ -266,105 +266,9 @@
                                  int iters_per_step,
                                  int *mvjcost, int *mvcost[2],
                                  int *distortion,
-                                 unsigned int *sse1) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  int thismse;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
-
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  const uint8_t *const y = xd->plane[0].pre[0].buf;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // calculate central point error
-  besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
-  // 1/2 pel
-  FIRST_LEVEL_CHECKS;
-  if (halfiters > 1) {
-    SECOND_LEVEL_CHECKS;
-  }
-  tr = br;
-  tc = bc;
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (quarteriters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
-    hstep >>= 1;
-    FIRST_LEVEL_CHECKS;
-    if (eighthiters > 1) {
-      SECOND_LEVEL_CHECKS;
-    }
-    tr = br;
-    tc = bc;
-  }
-  // These lines insure static analysis doesn't warn that
-  // tr and tc aren't used after the above point.
-  (void) tr;
-  (void) tc;
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
-
-  return besterr;
-}
-
-#undef DIST
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-    vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-              z, src_stride, &sse, second_pred)
-
-int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x,
-                                      MV *bestmv, const MV *ref_mv,
-                                      int allow_hp,
-                                      int error_per_bit,
-                                      const vp9_variance_fn_ptr_t *vfp,
-                                      int forced_stop,
-                                      int iters_per_step,
-                                      int *mvjcost, int *mvcost[2],
-                                      int *distortion,
-                                      unsigned int *sse1,
-                                      const uint8_t *second_pred,
-                                      int w, int h) {
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h) {
   const uint8_t *const z = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -376,7 +280,6 @@
   const unsigned int quarteriters = iters_per_step;
   const unsigned int eighthiters = iters_per_step;
 
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
   const int y_stride = xd->plane[0].pre[0].stride;
   const int offset = bestmv->row * y_stride + bestmv->col;
   const uint8_t *const y = xd->plane[0].pre[0].buf;
@@ -401,8 +304,13 @@
   // calculate central point error
   // TODO(yunqingwang): central pointer error was already calculated in full-
   // pixel search, and can be passed in this function.
-  vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  if (second_pred != NULL) {
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  } else {
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
+  }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -456,7 +364,6 @@
 
 #undef MVC
 #undef PRE
-#undef DIST
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 07e410d..366f9af 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -98,27 +98,12 @@
     const vp9_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
-    int *mvjcost,
-    int *mvcost[2],
-    int *distortion,
-    unsigned int *sse);
-
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
-
-typedef int (fractional_mv_step_comp_fp) (
-    const MACROBLOCK *x,
-    MV *bestmv, const MV *ref_mv,
-    int allow_hp,
-    int error_per_bit,
-    const vp9_variance_fn_ptr_t *vfp,
-    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
-    int iters_per_step,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
-extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 7515f44..c915e5c 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -106,24 +106,25 @@
   return const_motion;
 }
 
-static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    int_mv *tmp_mv, int *rate_mv) {
+static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int_mv *tmp_mv, int *rate_mv,
+                                  int64_t best_rd_sofar) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
-  int step_param;
-  int sadpb = x->sadperbit16;
+  const int step_param = cpi->sf.mv.fullpel_search_step_param;
+  const int sadpb = x->sadperbit16;
   MV mvp_full;
-  int ref = mbmi->ref_frame[0];
+  const int ref = mbmi->ref_frame[0];
   const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
-  int i;
-
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
-
+  int dis;
+  int rate_mode;
+  const int tmp_col_min = x->mv_col_min;
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
+  int rv = 0;
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
   if (scaled_ref_frame) {
@@ -133,27 +134,19 @@
     // motion search code to be used without additional modifications.
     for (i = 0; i < MAX_MB_PLANE; i++)
       backup_yv12[i] = xd->plane[i].pre[0];
-
     vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
-
   vp9_set_mv_search_range(x, &ref_mv);
 
-  // TODO(jingning) exploiting adaptive motion search control in non-RD
-  // mode decision too.
-  step_param = cpi->sf.mv.fullpel_search_step_param;
-
-  for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
-    if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-      tmp_mv->as_int = INVALID_MV;
-
-      if (scaled_ref_frame) {
-        int i;
-        for (i = 0; i < MAX_MB_PLANE; i++)
-          xd->plane[i].pre[0] = backup_yv12[i];
-      }
-      return;
+  if (cpi->common.show_frame &&
+      (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME]) {
+    tmp_mv->as_int = INVALID_MV;
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[0] = backup_yv12[i];
     }
+    return rv;
   }
   assert(x->mv_best_ref_index[ref] <= 2);
   if (x->mv_best_ref_index[ref] < 2)
@@ -172,60 +165,39 @@
   x->mv_row_min = tmp_row_min;
   x->mv_row_max = tmp_row_max;
 
-  if (scaled_ref_frame) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[0] = backup_yv12[i];
-  }
-
   // calculate the bit cost on motion vector
   mvp_full.row = tmp_mv->as_mv.row * 8;
   mvp_full.col = tmp_mv->as_mv.col * 8;
+
   *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-}
 
-static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    MV *tmp_mv) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
-  int ref = mbmi->ref_frame[0];
-  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
-  int dis;
+  rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref]]
+                                  [INTER_OFFSET(NEWMV)];
+  rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) >
+         best_rd_sofar);
 
-  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
-                                                                        ref);
-  if (scaled_ref_frame) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[0];
-
-    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  if (rv) {
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                 cpi->common.allow_high_precision_mv,
+                                 x->errorperbit,
+                                 &cpi->fn_ptr[bsize],
+                                 cpi->sf.mv.subpel_force_stop,
+                                 cpi->sf.mv.subpel_iters_per_step,
+                                 x->nmvjointcost, x->mvcost,
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
+    x->pred_mv[ref] = tmp_mv->as_mv;
   }
 
-  cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
-                               cpi->common.allow_high_precision_mv,
-                               x->errorperbit,
-                               &cpi->fn_ptr[bsize],
-                               cpi->sf.mv.subpel_force_stop,
-                               cpi->sf.mv.subpel_iters_per_step,
-                               x->nmvjointcost, x->mvcost,
-                               &dis, &x->pred_sse[ref]);
-
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++)
       xd->plane[i].pre[0] = backup_yv12[i];
   }
-
-  x->pred_mv[ref] = *tmp_mv;
+  return rv;
 }
 
+
 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               int *out_rate_sum, int64_t *out_dist_sum,
@@ -544,28 +516,17 @@
         continue;
 
       if (this_mode == NEWMV) {
-        int rate_mode = 0;
         if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
-
-        full_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
-
-        if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                    &frame_mv[NEWMV][ref_frame],
+                                    &rate_mv, best_rd))
           continue;
-
-        rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
-                                        [INTER_OFFSET(this_mode)];
-        if (RDCOST(x->rdmult, x->rddiv, rate_mv + rate_mode, 0) > best_rd)
-          continue;
-
-        sub_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                &frame_mv[NEWMV][ref_frame].as_mv);
       }
 
-      if (this_mode != NEARESTMV)
-        if (frame_mv[this_mode][ref_frame].as_int ==
-            frame_mv[NEARESTMV][ref_frame].as_int)
+      if (this_mode != NEARESTMV &&
+          frame_mv[this_mode][ref_frame].as_int ==
+              frame_mv[NEARESTMV][ref_frame].as_int)
           continue;
 
       mbmi->mode = this_mode;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 4964e0f..370e1ce 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -104,6 +104,49 @@
   *eob_ptr = eob + 1;
 }
 
+// Quantizes n_coeffs coefficients, visited in scan order (32x32 fp variant).
+// TODO(jingning) Refactor this file and combine functions with similar ops.
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             int zbin_oq_value, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;  // eob: scan index of the last nonzero quantized coeff
+  (void)zbin_ptr;  // (void) casts: parameters this variant never reads
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));  // zero outputs
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+
+  if (!skip_block) {  // skip_block leaves outputs zeroed and *eob_ptr == 0
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];  // coefficient position for scan index i
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // 0 or -1, given arithmetic >>
+      int tmp = 0;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {  // below dequant/4 -> 0
+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);  // keep in int16
+        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;  // quantized magnitude
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // restore sign
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      }
+
+      if (tmp)
+        eob = i;  // last scan index with a nonzero result so far
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last nonzero scan index; 0 if none
+}
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index a8daa21..998fb3c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1422,7 +1422,8 @@
                                          cpi->sf.mv.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
                                          &distortion,
-                                         &x->pred_sse[mbmi->ref_frame[0]]);
+                                         &x->pred_sse[mbmi->ref_frame[0]],
+                                         NULL, 0, 0);
 
             // save motion search result for use in compound prediction
             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1838,7 +1839,7 @@
                                  cpi->sf.mv.subpel_force_stop,
                                  cpi->sf.mv.subpel_iters_per_step,
                                  x->nmvjointcost, x->mvcost,
-                                 &dis, &x->pred_sse[ref]);
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
   }
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -1954,7 +1955,7 @@
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
-      bestsme = cpi->find_fractional_mv_step_comp(
+      bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv,
           &ref_mv[id].as_mv,
           cpi->common.allow_high_precision_mv,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 98d6825..f271182 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -253,6 +253,7 @@
   }
 
   if (speed >= 5) {
+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
     sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
         RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
     sf->max_partition_size = BLOCK_32X32;
@@ -265,7 +266,6 @@
     sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
     sf->use_nonrd_pick_mode = 1;
-    sf->mv.search_method = FAST_DIAMOND;
     sf->allow_skip_recode = 0;
   }
 
@@ -287,7 +287,7 @@
     sf->mv.reduce_first_step_size = 1;
   }
   if (speed >= 7) {
-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
+    sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
@@ -396,7 +396,6 @@
 
   if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
-    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index c090731..bcea100 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -178,7 +178,7 @@
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
                                          NULL, NULL,
-                                         &distortion, &sse);
+                                         &distortion, &sse, NULL, 0, 0);
 
   // Restore input state
   x->plane[0].src = src;
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 2d9f2b0..508e1d4 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -234,21 +234,18 @@
   movifnidn                   quantq, quantmp
   mova                            m1, [roundq]             ; m1 = round
   mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-; TODO(jingning) to be continued with 32x32 quantization process
+%ifidn %1, fp_32x32
   pcmpeqw                         m5, m5
   psrlw                           m5, 15
-  paddw                           m0, m5
   paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
   mova                            m3, [r2q]                ; m3 = dequant
   mov                             r3, qcoeffmp
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
+%ifidn %1, fp_32x32
+  psllw                           m2, 1
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
@@ -275,18 +272,19 @@
   psignw                         m13, m10                  ; m13 = reinsert sign
   mova        [qcoeffq+ncoeffq*2+ 0], m8
   mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
 %endif
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   psrlw                           m8, 1
   psrlw                          m13, 1
   psignw                          m8, m9
   psignw                         m13, m10
+  psrlw                           m0, m3, 2
 %endif
   mova       [dqcoeffq+ncoeffq*2+ 0], m8
   mova       [dqcoeffq+ncoeffq*2+16], m13
@@ -307,13 +305,17 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpeqw                         m7, m7
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
+  pcmpgtw                         m7, m6,  m0
+  pcmpgtw                        m12, m11, m0
   pmovmskb                        r6, m7
-  pmovmskb                        r2, m7
+  pmovmskb                        r2, m12
+
   or                              r6, r2
   jz .skip_iter
 %endif
+  pcmpeqw                         m7, m7
+
   paddsw                          m6, m1                   ; m6 += round
   paddsw                         m11, m1                   ; m11 += round
   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
@@ -322,13 +324,13 @@
   psignw                         m13, m10                  ; m13 = reinsert sign
   mova        [qcoeffq+ncoeffq*2+ 0], m14
   mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
 %endif
   pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   psrlw                          m14, 1
   psrlw                          m13, 1
   psignw                         m14, m9
@@ -349,7 +351,7 @@
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
   jmp .accumulate_eob
 .skip_iter:
   mova        [qcoeffq+ncoeffq*2+ 0], m5
@@ -397,3 +399,4 @@
 
 INIT_XMM ssse3
 QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8c1f345..81fe6a6 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -50,6 +50,8 @@
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
+VP9_COMMON_SRCS-yes += common/vp9_thread.h
+VP9_COMMON_SRCS-yes += common/vp9_thread.c
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b150161..24dcbfa 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -353,7 +353,7 @@
 
   oxcf->key_freq               = cfg->kf_max_dist;
 
-  oxcf->speed                  =  clamp(abs(extra_cfg->cpu_used), 0, 7);
+  oxcf->speed                  =  abs(extra_cfg->cpu_used);
   oxcf->encode_breakout        =  extra_cfg->static_thresh;
   oxcf->play_alternate         =  extra_cfg->enable_auto_alt_ref;
   oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index c3ca7ee..2591852 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_frame_buffers.h"
 
 #include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 
 #include "vp9/vp9_iface_common.h"
@@ -98,8 +99,10 @@
 static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
                                                 unsigned int data_sz,
                                                 vpx_codec_stream_info_t *si,
+                                                int *is_intra_only,
                                                 vpx_decrypt_cb decrypt_cb,
                                                 void *decrypt_state) {
+  int intra_only_flag = 0;
   uint8_t clear_buffer[9];
 
   if (data + data_sz <= data)
@@ -115,6 +118,8 @@
   }
 
   {
+    int show_frame;
+    int error_resilient;
     struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
     const int frame_marker = vp9_rb_read_literal(&rb, 2);
     const int version = vp9_rb_read_bit(&rb);
@@ -126,6 +131,7 @@
     if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM;
 
     if (vp9_rb_read_bit(&rb)) {  // show an existing frame
+      vp9_rb_read_literal(&rb, 3);  // Frame buffer to show.
       return VPX_CODEC_OK;
     }
 
@@ -133,18 +139,15 @@
       return VPX_CODEC_UNSUP_BITSTREAM;
 
     si->is_kf = !vp9_rb_read_bit(&rb);
+    show_frame = vp9_rb_read_bit(&rb);
+    error_resilient = vp9_rb_read_bit(&rb);
+
     if (si->is_kf) {
       const int sRGB = 7;
       int colorspace;
 
-      rb.bit_offset += 1;  // show frame
-      rb.bit_offset += 1;  // error resilient
-
-      if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 ||
-          vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 ||
-          vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) {
+      if (!vp9_read_sync_code(&rb))
         return VPX_CODEC_UNSUP_BITSTREAM;
-      }
 
       colorspace = vp9_rb_read_literal(&rb, 3);
       if (colorspace != sRGB) {
@@ -161,20 +164,28 @@
           return VPX_CODEC_UNSUP_BITSTREAM;
         }
       }
+      vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+    } else {
+      intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 
-      // TODO(jzern): these are available on non-keyframes in intra only mode.
-      si->w = vp9_rb_read_literal(&rb, 16) + 1;
-      si->h = vp9_rb_read_literal(&rb, 16) + 1;
+      if (intra_only_flag) {
+        if (!vp9_read_sync_code(&rb))
+          return VPX_CODEC_UNSUP_BITSTREAM;
+        rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
+        vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+      }
     }
   }
-
+  if (is_intra_only != NULL)
+    *is_intra_only = intra_only_flag;
   return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t decoder_peek_si(const uint8_t *data,
                                        unsigned int data_sz,
                                        vpx_codec_stream_info_t *si) {
-  return decoder_peek_si_internal(data, data_sz, si, NULL, NULL);
+  return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL);
 }
 
 static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
@@ -266,13 +277,14 @@
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
   if (!ctx->si.h) {
+    int is_intra_only = 0;
     const vpx_codec_err_t res =
-        decoder_peek_si_internal(*data, data_sz, &ctx->si, ctx->decrypt_cb,
-                                 ctx->decrypt_state);
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
+                                 ctx->decrypt_cb, ctx->decrypt_state);
     if (res != VPX_CODEC_OK)
       return res;
 
-    if (!ctx->si.is_kf)
+    if (!ctx->si.is_kf && !is_intra_only)
       return VPX_CODEC_ERROR;
   }
 
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 92ec6fd..1fcb36f 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -31,8 +31,6 @@
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_decoder.c
 VP9_DX_SRCS-yes += decoder/vp9_decoder.h
-VP9_DX_SRCS-yes += decoder/vp9_thread.c
-VP9_DX_SRCS-yes += decoder/vp9_thread.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h