Merge "vpxdec: add --keep-going option"
diff --git a/configure b/configure
index 0dca4c3..b98480e 100755
--- a/configure
+++ b/configure
@@ -274,6 +274,7 @@
     multiple_arf
     spatial_svc
     denoising
+    fp_mb_stats
 "
 CONFIG_LIST="
     external_build
diff --git a/test/md5_helper.h b/test/md5_helper.h
index dd446f4..dc95582 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -28,10 +28,11 @@
       // plane, we never want to round down and thus skip a pixel so if
       // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
       // This works only for chroma_shift of 0 and 1.
+      const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
       const int h = plane ? (img->d_h + img->y_chroma_shift) >>
                     img->y_chroma_shift : img->d_h;
-      const int w = plane ? (img->d_w + img->x_chroma_shift) >>
-                    img->x_chroma_shift : img->d_w;
+      const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                     img->x_chroma_shift : img->d_w) * bytes_per_sample;
 
       for (int y = 0; y < h; ++y) {
         MD5Update(&md5_, buf, w);
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index af1815c..f9c09c6 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -6,6 +6,15 @@
 2dadee5306245fa5eeb0f99652d0e17afbcba96d  invalid-vp90-02.webm.res
 df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03.webm
 8fe6fd82bf537340f586f97a7ae31fb37ccda302  invalid-vp90-03.webm.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5  park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17  park_joy_90p_10_444.y4m
+614c32ae1eca391e867c70d19974f0d62664dd99  park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e  park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34  park_joy_90p_12_444.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c  park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947  park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03  park_joy_90p_8_444.y4m
 b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
diff --git a/test/test.mk b/test/test.mk
index f06e28e..85212d9 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -15,7 +15,7 @@
 ##
 ## Black box tests only use the public API.
 ##
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
@@ -42,6 +42,9 @@
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
 
+## Y4m parsing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_test.cc ../y4menc.c ../y4menc.h
+
 ## WebM Parsing
 ifeq ($(CONFIG_WEBM_IO), yes)
 LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.cpp
@@ -134,6 +137,17 @@
 ##
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
+
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
 
diff --git a/test/video_source.h b/test/video_source.h
index 6d1855a..4250cb7 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -50,6 +50,15 @@
   return fopen(path_to_source.c_str(), "rb");
 }
 
+static FILE *OpenTestOutFile(const std::string& file_name) {
+  const std::string path_to_source = GetDataPath() + "/" + file_name;
+  return fopen(path_to_source.c_str(), "wb");
+}
+
+static FILE *OpenTempOutFile() {
+  return tmpfile();
+}
+
 // Abstract base class for test video sources, which provide a stream of
 // vpx_image_t images with associated timestamps and duration.
 class VideoSource {
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 68ee99a..72719a6 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -28,11 +28,11 @@
  protected:
   virtual ~VP9WorkerThreadTest() {}
   virtual void SetUp() {
-    vp9_worker_init(&worker_);
+    vp9_get_worker_interface()->init(&worker_);
   }
 
   virtual void TearDown() {
-    vp9_worker_end(&worker_);
+    vp9_get_worker_interface()->end(&worker_);
   }
 
   VP9Worker worker_;
@@ -45,10 +45,11 @@
 }
 
 TEST_P(VP9WorkerThreadTest, HookSuccess) {
-  EXPECT_NE(vp9_worker_sync(&worker_), 0);  // should be a no-op.
+  // should be a no-op.
+  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
 
   for (int i = 0; i < 2; ++i) {
-    EXPECT_NE(vp9_worker_reset(&worker_), 0);
+    EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
 
     int hook_data = 0;
     int return_value = 1;  // return successfully from the hook
@@ -58,20 +59,21 @@
 
     const bool synchronous = GetParam();
     if (synchronous) {
-      vp9_worker_execute(&worker_);
+      vp9_get_worker_interface()->execute(&worker_);
     } else {
-      vp9_worker_launch(&worker_);
+      vp9_get_worker_interface()->launch(&worker_);
     }
-    EXPECT_NE(vp9_worker_sync(&worker_), 0);
+    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
     EXPECT_FALSE(worker_.had_error);
     EXPECT_EQ(5, hook_data);
 
-    EXPECT_NE(vp9_worker_sync(&worker_), 0);  // should be a no-op.
+    // should be a no-op.
+    EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
   }
 }
 
 TEST_P(VP9WorkerThreadTest, HookFailure) {
-  EXPECT_NE(vp9_worker_reset(&worker_), 0);
+  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
 
   int hook_data = 0;
   int return_value = 0;  // return failure from the hook
@@ -81,26 +83,49 @@
 
   const bool synchronous = GetParam();
   if (synchronous) {
-    vp9_worker_execute(&worker_);
+    vp9_get_worker_interface()->execute(&worker_);
   } else {
-    vp9_worker_launch(&worker_);
+    vp9_get_worker_interface()->launch(&worker_);
   }
-  EXPECT_FALSE(vp9_worker_sync(&worker_));
+  EXPECT_FALSE(vp9_get_worker_interface()->sync(&worker_));
   EXPECT_EQ(1, worker_.had_error);
 
   // Ensure _reset() clears the error and _launch() can be called again.
   return_value = 1;
-  EXPECT_NE(vp9_worker_reset(&worker_), 0);
+  EXPECT_NE(vp9_get_worker_interface()->reset(&worker_), 0);
   EXPECT_FALSE(worker_.had_error);
-  vp9_worker_launch(&worker_);
-  EXPECT_NE(vp9_worker_sync(&worker_), 0);
+  vp9_get_worker_interface()->launch(&worker_);
+  EXPECT_NE(vp9_get_worker_interface()->sync(&worker_), 0);
   EXPECT_FALSE(worker_.had_error);
 }
 
+TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
+  EXPECT_EQ(0, vp9_set_worker_interface(NULL));
+  EXPECT_TRUE(vp9_get_worker_interface() != NULL);
+  for (int i = 0; i < 6; ++i) {
+    VP9WorkerInterface winterface = *vp9_get_worker_interface();
+    switch (i) {
+      default:
+      case 0: winterface.init = NULL; break;
+      case 1: winterface.reset = NULL; break;
+      case 2: winterface.sync = NULL; break;
+      case 3: winterface.launch = NULL; break;
+      case 4: winterface.execute = NULL; break;
+      case 5: winterface.end = NULL; break;
+    }
+    EXPECT_EQ(0, vp9_set_worker_interface(&winterface));
+  }
+}
+
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests
 
 #if CONFIG_WEBM_IO
+struct FileList {
+  const char *name;
+  const char *expected_md5;
+};
+
 // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
 string DecodeFile(const string& filename, int num_threads) {
   libvpx_test::WebMVideoSource video(filename);
@@ -130,39 +155,77 @@
   return string(md5.Get());
 }
 
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *iter = files; iter->name != NULL; ++iter) {
+    SCOPED_TRACE(iter->name);
+    for (int t = 2; t <= 8; ++t) {
+      EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
+          << "threads = " << t;
+    }
+  }
+}
+
+// Trivial serialized thread worker interface implementation.
+// Note that any worker that requires synchronization with other workers will
+// hang.
+namespace impl {
+
+void Init(VP9Worker *const worker) { memset(worker, 0, sizeof(*worker)); }
+int Reset(VP9Worker *const /*worker*/) { return 1; }
+int Sync(VP9Worker *const worker) { return !worker->had_error; }
+
+void Execute(VP9Worker *const worker) {  // hook() returns 0 on failure.
+  worker->had_error |= !worker->hook(worker->data1, worker->data2);
+}
+
+void Launch(VP9Worker *const worker) { Execute(worker); }
+void End(VP9Worker *const /*worker*/) {}
+
+}  // namespace impl
+
+TEST(VP9WorkerThreadTest, TestSerialInterface) {
+  static const VP9WorkerInterface serial_interface = {
+    impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
+  };
+  // TODO(jzern): Avoid using a file that will use the row-based thread
+  // loopfilter; with the simple serialized implementation it will hang. This is
+  // due to its expectation that rows will be run in parallel as they wait on
+  // progress in the row above before proceeding.
+  static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
+  static const char filename[] = "vp90-2-03-size-226x226.webm";
+  VP9WorkerInterface default_interface = *vp9_get_worker_interface();
+
+  EXPECT_NE(vp9_set_worker_interface(&serial_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+
+  // Reset the interface.
+  EXPECT_NE(vp9_set_worker_interface(&default_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+}
+
 TEST(VP9DecodeMultiThreadedTest, Decode) {
   // no tiles or frame parallel; this exercises loop filter threading.
-  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
-               DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str());
+  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
+            DecodeFile("vp90-2-03-size-226x226.webm", 2));
 }
 
 TEST(VP9DecodeMultiThreadedTest, Decode2) {
-  static const struct {
-    const char *name;
-    const char *expected_md5;
-  } files[] = {
+  static const FileList files[] = {
     { "vp90-2-08-tile_1x2_frame_parallel.webm",
       "68ede6abd66bae0a2edf2eb9232241b6" },
     { "vp90-2-08-tile_1x4_frame_parallel.webm",
       "368ebc6ebf3a5e478d85b2c3149b2848" },
     { "vp90-2-08-tile_1x8_frame_parallel.webm",
       "17e439da2388aff3a0f69cb22579c6c1" },
+    { NULL, NULL }
   };
 
-  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
-          << "threads = " << t;
-    }
-  }
+  DecodeFiles(files);
 }
 
 // Test tile quantity changes within one file.
 TEST(VP9DecodeMultiThreadedTest, Decode3) {
-  static const struct {
-    const char *name;
-    const char *expected_md5;
-  } files[] = {
+  static const FileList files[] = {
     { "vp90-2-14-resize-fp-tiles-1-16.webm",
       "0cd5e632c326297e975f38949c31ea94" },
     { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
@@ -207,14 +270,10 @@
       "ae96f21f21b6370cc0125621b441fc52" },
     { "vp90-2-14-resize-fp-tiles-8-4.webm",
       "3eb4f24f10640d42218f7fd7b9fd30d4" },
+    { NULL, NULL }
   };
 
-  for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
-          << "threads = " << t;
-    }
-  }
+  DecodeFiles(files);
 }
 #endif  // CONFIG_WEBM_IO
 
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
new file mode 100644
index 0000000..cfa30e8
--- /dev/null
+++ b/test/y4m_test.cc
@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "./y4menc.h"
+
+namespace {
+
+using std::string;
+using std::tr1::make_tuple;
+
+static const unsigned int kWidth  = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+typedef std::tr1::tuple<const char *, const unsigned int,
+        const vpx_img_fmt, const char *> test_entry_type;
+
+static const test_entry_type kY4mTestVectors[] = {
+  make_tuple("park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420,
+             "e5406275b9fc6bb3436c31d4a05c1cab"),
+  make_tuple("park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422,
+             "284a47a47133b12884ec3a14e959a0b6"),
+  make_tuple("park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
+             "90517ff33843d85de712fd4fe60dbed0"),
+  make_tuple("park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
+             "63f21f9f717d8b8631bd2288ee87137b"),
+  make_tuple("park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
+             "48ab51fb540aed07f7ff5af130c9b605"),
+  make_tuple("park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
+             "067bfd75aa85ff9bae91fa3e0edd1e3e"),
+  make_tuple("park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
+             "9e6d8f6508c6e55625f6b697bc461cef"),
+  make_tuple("park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
+             "b239c6b301c0b835485be349ca83a7e3"),
+  make_tuple("park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
+             "5a6481a550821dab6d0192f5c63845e9")
+};
+
+static void write_image_file(const vpx_image_t *img, FILE *file) {
+  int plane, y;
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+    const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
+                   img->y_chroma_shift : img->d_h);
+    const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                   img->x_chroma_shift : img->d_w);
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, bytes_per_sample, w, file);
+      buf += stride;
+    }
+  }
+}
+
+class Y4mVideoSourceTest
+    : public ::testing::TestWithParam<test_entry_type>,
+      public ::libvpx_test::Y4mVideoSource {
+ protected:
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  virtual ~Y4mVideoSourceTest() {
+    CloseSource();
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    file_name_ = file_name;
+    start_ = 0;
+    limit_ = limit;
+    frame_ = 0;
+    Begin();
+  }
+
+  // Checks y4m header information
+  void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+    ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.vpx_fmt, fmt);
+    if (fmt == VPX_IMG_FMT_I420 || fmt == VPX_IMG_FMT_I42016) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    }
+    if (fmt == VPX_IMG_FMT_I422 || fmt == VPX_IMG_FMT_I42216) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+    if (fmt == VPX_IMG_FMT_I444 || fmt == VPX_IMG_FMT_I44416) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Checks MD5 of the raw frame data
+  void Md5Check(const string &expected_md5) {
+    ASSERT_TRUE(input_file_ != NULL);
+    libvpx_test::MD5 md5;
+    for (unsigned int i = start_; i < limit_; i++) {
+      md5.Add(img());
+      Next();
+    }
+    ASSERT_EQ(string(md5.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  const char *filename = GET_PARAM(0);
+  const unsigned int bit_depth = GET_PARAM(1);
+  const vpx_img_fmt format = GET_PARAM(2);
+  const char *md5raw = GET_PARAM(3);
+
+  Init(filename, kFrames);
+  HeaderChecks(bit_depth, format);
+  Md5Check(md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest
+    : public Y4mVideoSourceTest {
+ protected:
+  Y4mVideoWriteTest() : Y4mVideoSourceTest() {}
+
+  virtual void ReplaceInputFp(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes out a y4m file and then reads it back
+  void WriteY4mAndReadBack() {
+    ASSERT_TRUE(input_file_ != NULL);
+    char buf[Y4M_BUFFER_SIZE] = {0};
+    const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d};
+    FILE *out_file = libvpx_test::OpenTempOutFile();
+    ASSERT_TRUE(out_file != NULL);
+    y4m_write_file_header(buf, sizeof(buf),
+                          kWidth, kHeight,
+                          &framerate, y4m_.vpx_fmt,
+                          y4m_.bit_depth);
+    fputs(buf, out_file);
+    for (unsigned int i = start_; i < limit_; i++) {
+      y4m_write_frame_header(buf, sizeof(buf));
+      fputs(buf, out_file);
+      write_image_file(img(), out_file);
+      Next();
+    }
+    ReplaceInputFp(out_file);
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  const char *filename = GET_PARAM(0);
+  const unsigned int bit_depth = GET_PARAM(1);
+  const vpx_img_fmt format = GET_PARAM(2);
+  const char *md5raw = GET_PARAM(3);
+
+  Init(filename, kFrames);
+  HeaderChecks(bit_depth, format);
+  Md5Check(md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+}  // namespace
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
index 7419043..378e75b 100644
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -38,24 +38,30 @@
     CloseSource();
   }
 
-  virtual void Begin() {
+  virtual void OpenSource() {
     CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
     ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-        << file_name_;
+                                     << file_name_;
+  }
 
-    y4m_input_open(&y4m_, input_file_, NULL, 0, 0);
+  virtual void ReadSourceToStart() {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
     framerate_numerator_ = y4m_.fps_n;
     framerate_denominator_ = y4m_.fps_d;
-
     frame_ = 0;
     for (unsigned int i = 0; i < start_; i++) {
-        Next();
+      Next();
     }
-
     FillFrame();
   }
 
+  virtual void Begin() {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
   virtual void Next() {
     ++frame_;
     FillFrame();
diff --git a/tools_common.h b/tools_common.h
index e033de2..6a9f4f7 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -90,6 +90,7 @@
   uint32_t width;
   uint32_t height;
   vpx_img_fmt_t fmt;
+  vpx_bit_depth_t bit_depth;
   int only_i420;
   uint32_t fourcc;
   struct VpxRational framerate;
diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
index eef2d79..66f4bf6 100644
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h
@@ -18,18 +18,18 @@
 
 #if HAVE_EDSP
 void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
-                             const vp8_token *,
+                             vp8_token *,
                              const vp8_extra_bit_struct *,
                              const vp8_tree_index *);
 void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
                                              unsigned char * cx_data,
                                              const unsigned char *cx_data_end,
                                              int num_parts,
-                                             const vp8_token *,
+                                             vp8_token *,
                                              const vp8_extra_bit_struct *,
                                              const vp8_tree_index *);
 void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
-                                    const vp8_token *,
+                                    vp8_token *,
                                     const vp8_extra_bit_struct *,
                                     const vp8_tree_index *);
 # define pack_tokens(a,b,c)                  \
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 68613ec..f52dccb 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -714,6 +714,9 @@
 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vp9_subtract_block/, "$sse2_x86inc";
 
+add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 8efae95..8b96abb 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -771,6 +771,7 @@
                                    const uint8_t *data,
                                    const uint8_t *data_end) {
   VP9_COMMON *const cm = &pbi->common;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
@@ -783,7 +784,7 @@
     CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
                     vpx_memalign(32, sizeof(LFWorkerData)));
     pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
-    if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+    if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Loop filter thread creation failed");
     }
@@ -869,13 +870,13 @@
         // decoding has completed: finish up the loop filter in this thread.
         if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;
 
-        vp9_worker_sync(&pbi->lf_worker);
+        winterface->sync(&pbi->lf_worker);
         lf_data->start = lf_start;
         lf_data->stop = mi_row;
         if (pbi->max_threads > 1) {
-          vp9_worker_launch(&pbi->lf_worker);
+          winterface->launch(&pbi->lf_worker);
         } else {
-          vp9_worker_execute(&pbi->lf_worker);
+          winterface->execute(&pbi->lf_worker);
         }
       }
     }
@@ -884,10 +885,10 @@
   // Loopfilter remaining rows in the frame.
   if (cm->lf.filter_level) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-    vp9_worker_sync(&pbi->lf_worker);
+    winterface->sync(&pbi->lf_worker);
     lf_data->start = lf_data->stop;
     lf_data->stop = cm->mi_rows;
-    vp9_worker_execute(&pbi->lf_worker);
+    winterface->execute(&pbi->lf_worker);
   }
 
   // Get last tile data.
@@ -931,6 +932,7 @@
                                       const uint8_t *data,
                                       const uint8_t *data_end) {
   VP9_COMMON *const cm = &pbi->common;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   const uint8_t *bit_reader_end = NULL;
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
   const int tile_cols = 1 << cm->log2_tile_cols;
@@ -957,11 +959,11 @@
       VP9Worker *const worker = &pbi->tile_workers[i];
       ++pbi->num_tile_workers;
 
-      vp9_worker_init(worker);
+      winterface->init(worker);
       CHECK_MEM_ERROR(cm, worker->data1,
                       vpx_memalign(32, sizeof(TileWorkerData)));
       CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
-      if (i < num_threads - 1 && !vp9_worker_reset(worker)) {
+      if (i < num_threads - 1 && !winterface->reset(worker)) {
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                            "Tile decoder thread creation failed");
       }
@@ -1024,9 +1026,9 @@
 
       worker->had_error = 0;
       if (i == num_workers - 1 || n == tile_cols - 1) {
-        vp9_worker_execute(worker);
+        winterface->execute(worker);
       } else {
-        vp9_worker_launch(worker);
+        winterface->launch(worker);
       }
 
       if (buf->col == tile_cols - 1) {
@@ -1038,7 +1040,7 @@
 
     for (; i > 0; --i) {
       VP9Worker *const worker = &pbi->tile_workers[i - 1];
-      pbi->mb.corrupted |= !vp9_worker_sync(worker);
+      pbi->mb.corrupted |= !winterface->sync(worker);
     }
     if (final_worker > -1) {
       TileWorkerData *const tile_data =
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 84cb84a..d154e9d 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -76,7 +76,7 @@
 
   cm->error.setjmp = 0;
 
-  vp9_worker_init(&pbi->lf_worker);
+  vp9_get_worker_interface()->init(&pbi->lf_worker);
 
   return pbi;
 }
@@ -86,12 +86,12 @@
   int i;
 
   vp9_remove_common(cm);
-  vp9_worker_end(&pbi->lf_worker);
+  vp9_get_worker_interface()->end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
   vpx_free(pbi->tile_data);
   for (i = 0; i < pbi->num_tile_workers; ++i) {
     VP9Worker *const worker = &pbi->tile_workers[i];
-    vp9_worker_end(worker);
+    vp9_get_worker_interface()->end(worker);
     vpx_free(worker->data1);
     vpx_free(worker->data2);
   }
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 170ccb5..5dda49a 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -138,6 +138,7 @@
                               int frame_filter_level,
                               int y_only) {
   VP9LfSync *const lf_sync = &pbi->lf_row_sync;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
   const int tile_cols = 1 << cm->log2_tile_cols;
@@ -197,15 +198,15 @@
 
     // Start loopfiltering
     if (i == num_workers - 1) {
-      vp9_worker_execute(worker);
+      winterface->execute(worker);
     } else {
-      vp9_worker_launch(worker);
+      winterface->launch(worker);
     }
   }
 
   // Wait till all rows are finished
   for (i = 0; i < num_workers; ++i) {
-    vp9_worker_sync(&pbi->tile_workers[i]);
+    winterface->sync(&pbi->tile_workers[i]);
   }
 }
 
diff --git a/vp9/decoder/vp9_thread.c b/vp9/decoder/vp9_thread.c
index 5d31d3d..348bdf6 100644
--- a/vp9/decoder/vp9_thread.c
+++ b/vp9/decoder/vp9_thread.c
@@ -11,71 +11,79 @@
 //
 // Original source:
 //  http://git.chromium.org/webm/libwebp.git
-//  100644 blob eff8f2a8c20095aade3c292b0e9292dac6cb3587  src/utils/thread.c
-
+//  100644 blob 08ad4e1fecba302bf1247645e84a7d2779956bc3  src/utils/thread.c
 
 #include <assert.h>
 #include <string.h>   // for memset()
 #include "./vp9_thread.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "vpx_mem/vpx_mem.h"
 
 #if CONFIG_MULTITHREAD
 
+struct VP9WorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+};
+
 //------------------------------------------------------------------------------
 
-static THREADFN thread_loop(void *ptr) {    // thread loop
-  VP9Worker* const worker = (VP9Worker*)ptr;
+static void execute(VP9Worker *const worker);  // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+  VP9Worker *const worker = (VP9Worker*)ptr;
   int done = 0;
   while (!done) {
-    pthread_mutex_lock(&worker->mutex_);
+    pthread_mutex_lock(&worker->impl_->mutex_);
     while (worker->status_ == OK) {   // wait in idling mode
-      pthread_cond_wait(&worker->condition_, &worker->mutex_);
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
     if (worker->status_ == WORK) {
-      vp9_worker_execute(worker);
+      execute(worker);
       worker->status_ = OK;
     } else if (worker->status_ == NOT_OK) {   // finish the worker
       done = 1;
     }
-    // signal to the main thread that we're done (for Sync())
-    pthread_cond_signal(&worker->condition_);
-    pthread_mutex_unlock(&worker->mutex_);
+    // signal to the main thread that we're done (for sync())
+    pthread_cond_signal(&worker->impl_->condition_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
   }
   return THREAD_RETURN(NULL);    // Thread is finished
 }
 
 // main thread state control
-static void change_state(VP9Worker* const worker,
+static void change_state(VP9Worker *const worker,
                          VP9WorkerStatus new_status) {
-  // no-op when attempting to change state on a thread that didn't come up
-  if (worker->status_ < OK) return;
+  // No-op when attempting to change state on a thread that didn't come up.
+  // Checking status_ without acquiring the lock first would result in a data
+  // race.
+  if (worker->impl_ == NULL) return;
 
-  pthread_mutex_lock(&worker->mutex_);
-  // wait for the worker to finish
-  while (worker->status_ != OK) {
-    pthread_cond_wait(&worker->condition_, &worker->mutex_);
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  if (worker->status_ >= OK) {
+    // wait for the worker to finish
+    while (worker->status_ != OK) {
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    // assign new status and release the working thread if needed
+    if (new_status != OK) {
+      worker->status_ = new_status;
+      pthread_cond_signal(&worker->impl_->condition_);
+    }
   }
-  // assign new status and release the working thread if needed
-  if (new_status != OK) {
-    worker->status_ = new_status;
-    pthread_cond_signal(&worker->condition_);
-  }
-  pthread_mutex_unlock(&worker->mutex_);
+  pthread_mutex_unlock(&worker->impl_->mutex_);
 }
 
 #endif  // CONFIG_MULTITHREAD
 
 //------------------------------------------------------------------------------
 
-void vp9_worker_init(VP9Worker* const worker) {
+static void init(VP9Worker *const worker) {
   memset(worker, 0, sizeof(*worker));
   worker->status_ = NOT_OK;
 }
 
-int vp9_worker_sync(VP9Worker* const worker) {
+static int sync(VP9Worker *const worker) {
 #if CONFIG_MULTITHREAD
   change_state(worker, OK);
 #endif
@@ -83,59 +91,93 @@
   return !worker->had_error;
 }
 
-int vp9_worker_reset(VP9Worker* const worker) {
+static int reset(VP9Worker *const worker) {
   int ok = 1;
   worker->had_error = 0;
   if (worker->status_ < OK) {
 #if CONFIG_MULTITHREAD
-    if (pthread_mutex_init(&worker->mutex_, NULL) ||
-        pthread_cond_init(&worker->condition_, NULL)) {
+    worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+    if (worker->impl_ == NULL) {
       return 0;
     }
-    pthread_mutex_lock(&worker->mutex_);
-    ok = !pthread_create(&worker->thread_, NULL, thread_loop, worker);
+    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+      goto Error;
+    }
+    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      goto Error;
+    }
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
     if (ok) worker->status_ = OK;
-    pthread_mutex_unlock(&worker->mutex_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+    if (!ok) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+      vpx_free(worker->impl_);
+      worker->impl_ = NULL;
+      return 0;
+    }
 #else
     worker->status_ = OK;
 #endif
   } else if (worker->status_ > OK) {
-    ok = vp9_worker_sync(worker);
+    ok = sync(worker);
   }
   assert(!ok || (worker->status_ == OK));
   return ok;
 }
 
-void vp9_worker_execute(VP9Worker* const worker) {
+static void execute(VP9Worker *const worker) {
   if (worker->hook != NULL) {
     worker->had_error |= !worker->hook(worker->data1, worker->data2);
   }
 }
 
-void vp9_worker_launch(VP9Worker* const worker) {
+static void launch(VP9Worker *const worker) {
 #if CONFIG_MULTITHREAD
   change_state(worker, WORK);
 #else
-  vp9_worker_execute(worker);
+  execute(worker);
 #endif
 }
 
-void vp9_worker_end(VP9Worker* const worker) {
+static void end(VP9Worker *const worker) {
   if (worker->status_ >= OK) {
 #if CONFIG_MULTITHREAD
     change_state(worker, NOT_OK);
-    pthread_join(worker->thread_, NULL);
-    pthread_mutex_destroy(&worker->mutex_);
-    pthread_cond_destroy(&worker->condition_);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
 #else
     worker->status_ = NOT_OK;
 #endif
   }
+  vpx_free(worker->impl_);
+  worker->impl_ = NULL;
   assert(worker->status_ == NOT_OK);
 }
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+static VP9WorkerInterface g_worker_interface = {
+  init, reset, sync, launch, execute, end
+};
+
+int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
+  if (winterface == NULL ||
+      winterface->init == NULL || winterface->reset == NULL ||
+      winterface->sync == NULL || winterface->launch == NULL ||
+      winterface->execute == NULL || winterface->end == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const VP9WorkerInterface *vp9_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/vp9/decoder/vp9_thread.h b/vp9/decoder/vp9_thread.h
index 2f8728d..864579c 100644
--- a/vp9/decoder/vp9_thread.h
+++ b/vp9/decoder/vp9_thread.h
@@ -11,8 +11,7 @@
 //
 // Original source:
 //  http://git.chromium.org/webm/libwebp.git
-//  100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d  src/utils/thread.h
-
+//  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38  src/utils/thread.h
 
 #ifndef VP9_DECODER_VP9_THREAD_H_
 #define VP9_DECODER_VP9_THREAD_H_
@@ -163,40 +162,53 @@
 // arguments (data1 and data2), and should return false in case of error.
 typedef int (*VP9WorkerHook)(void*, void*);
 
-// Synchronize object used to launch job in the worker thread
+// Platform-dependent implementation details for the worker.
+typedef struct VP9WorkerImpl VP9WorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
 typedef struct {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t mutex_;
-  pthread_cond_t  condition_;
-  pthread_t       thread_;
-#endif
+  VP9WorkerImpl *impl_;
   VP9WorkerStatus status_;
   VP9WorkerHook hook;     // hook to call
-  void* data1;            // first argument passed to 'hook'
-  void* data2;            // second argument passed to 'hook'
+  void *data1;            // first argument passed to 'hook'
+  void *data2;            // second argument passed to 'hook'
   int had_error;          // return value of the last call to 'hook'
 } VP9Worker;
 
-// Must be called first, before any other method.
-void vp9_worker_init(VP9Worker* const worker);
-// Must be called to initialize the object and spawn the thread. Re-entrant.
-// Will potentially launch the thread. Returns false in case of error.
-int vp9_worker_reset(VP9Worker* const worker);
-// Makes sure the previous work is finished. Returns true if worker->had_error
-// was not set and no error condition was triggered by the working thread.
-int vp9_worker_sync(VP9Worker* const worker);
-// Triggers the thread to call hook() with data1 and data2 argument. These
-// hook/data1/data2 can be changed at any time before calling this function,
-// but not be changed afterward until the next call to vp9_worker_sync().
-void vp9_worker_launch(VP9Worker* const worker);
-// This function is similar to vp9_worker_launch() except that it calls the
-// hook directly instead of using a thread. Convenient to bypass the thread
-// mechanism while still using the VP9Worker structs. vp9_worker_sync() must
-// still be called afterward (for error reporting).
-void vp9_worker_execute(VP9Worker* const worker);
-// Kill the thread and terminate the object. To use the object again, one
-// must call vp9_worker_reset() again.
-void vp9_worker_end(VP9Worker* const worker);
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+  // Must be called first, before any other method.
+  void (*init)(VP9Worker *const worker);
+  // Must be called to initialize the object and spawn the thread. Re-entrant.
+  // Will potentially launch the thread. Returns false in case of error.
+  int (*reset)(VP9Worker *const worker);
+  // Makes sure the previous work is finished. Returns true if worker->had_error
+  // was not set and no error condition was triggered by the working thread.
+  int (*sync)(VP9Worker *const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but must not be changed afterward until the next call to sync().
+  void (*launch)(VP9Worker *const worker);
+  // This function is similar to launch() except that it calls the
+  // hook directly instead of using a thread. Convenient to bypass the thread
+  // mechanism while still using the VP9Worker structs. sync() must
+  // still be called afterward (for error reporting).
+  void (*execute)(VP9Worker *const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call reset() again.
+  void (*end)(VP9Worker *const worker);
+} VP9WorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied; it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const VP9WorkerInterface *vp9_get_worker_interface(void);
 
 //------------------------------------------------------------------------------
 
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 0d6b41d..33f9239 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -15,8 +15,19 @@
 
 #include "vp9/encoder/vp9_segmentation.h"
 
-static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
-  {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+#define AQ_C_SEGMENTS  3
+#define AQ_C_STRENGTHS  3
+static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+
+static int get_aq_c_strength(int q_index) {
+  // Approximate base quantizer (truncated to int)
+  int base_quant = vp9_ac_quant(q_index, 0) / 4;
+  return (base_quant > 20) + (base_quant > 45);
+}
 
 void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -29,6 +40,8 @@
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     int segment;
+    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int active_segments = aq_c_active_segments[aq_strength];
 
     // Clear down the segment map.
     vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
@@ -36,9 +49,17 @@
     // Clear down the complexity map used for rd.
     vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
 
-    vp9_enable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
+    // Segmentation only makes sense if the target bits per SB is above a
+    // threshold. Below this the overheads will usually outweigh any benefit.
+    if (cpi->rc.sb64_target_rate < 256) {
+      vp9_disable_segmentation(seg);
+      return;
+    }
+
+    vp9_enable_segmentation(seg);
+
     // Select delta coding method.
     seg->abs_delta = SEGMENT_DELTADATA;
 
@@ -46,14 +67,14 @@
     vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
 
     // Use some of the segments for in frame Q adjustment.
-    for (segment = 1; segment < 2; segment++) {
+    for (segment = 1; segment < active_segments; ++segment) {
       int qindex_delta =
           vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
-                                     in_frame_q_adj_ratio[segment]);
+                                     aq_c_q_adj_factor[aq_strength][segment]);
 
-      // For AQ mode 2, we dont allow Q0 in a segment if the base Q is not 0.
-      // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
-      // is sometimes applied without going back around the rd loop.
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
+      // Q is not 0. Q0 (lossless) implies 4x4 only and in this mode a segment
+      // Q delta is sometimes applied without going back around the rd loop.
       // This could lead to an illegal combination of partition size and q.
       if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
         qindex_delta = -cm->base_qindex + 1;
@@ -66,10 +87,15 @@
   }
 }
 
-// Select a segment for the current SB64
+// Select a segment for the current SB64 block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average.
+// An "aq_strength" value determines how many segments are supported,
+// the set of transition points to use and the extent of the quantizer
+// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
 void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
-                                      int mi_row, int mi_col,
-                                      int output_enabled, int projected_rate) {
+                                   int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate) {
   VP9_COMMON *const cm = &cpi->common;
 
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
@@ -89,11 +115,22 @@
     // It is converted to bits * 256 units.
     const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
                             (bw * bh);
+    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int active_segments = aq_c_active_segments[aq_strength];
 
-    if (projected_rate < (target_rate / 4)) {
-      segment = 1;
-    } else {
-      segment = 0;
+    // The number of segments considered and the transition points used to
+    // select them is determined by the "aq_strength" value.
+    // Currently this loop only supports segments that reduce Q (i.e. where
+    // there is undershoot).
+    // The loop counts down towards segment 0 which is the default segment
+    // with no Q adjustment.
+    segment = active_segments - 1;
+    while (segment > 0) {
+      if (projected_rate <
+          (target_rate * aq_c_transitions[aq_strength][segment])) {
+        break;
+      }
+      --segment;
     }
 
     if (target_rate > 0) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 61d9d5d..ab7991e 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -35,6 +35,7 @@
 
   // Quantizer setings
   int16_t *quant_fp;
+  int16_t *round_fp;
   int16_t *quant;
   int16_t *quant_shift;
   int16_t *zbin;
@@ -110,6 +111,9 @@
   int use_lp32x32fdct;
   int skip_encode;
 
+  // use fast quantization process
+  int quant_fp;
+
   // skip forward transform and quantization
   int skip_txfm;
 
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index ff66abb..f6393e0 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -15,34 +15,84 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_denoiser.h"
 
+/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
+ * with speed 6, though it (inexplicably) seems to also work with speed 5 (one
+ * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to
+ * make the calls to the vp9_denoiser_* functions when in speed 5).
+ *
+ * The implementation is very similar to that of the VP8 denoiser. While
+ * choosing the motion vectors / reference frames, the denoiser is run, and if
+ * it did not modify the signal too much, the denoised block is copied to the
+ * signal.
+ */
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
 static const int widths[]  = {4, 4, 8, 8,  8, 16, 16, 16, 32, 32, 32, 64, 64};
 static const int heights[] = {4, 8, 4, 8, 16,  8, 16, 32, 16, 32, 64, 32, 64};
 
-int vp9_denoiser_filter() {
-  return 0;
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  return 3 + (increase_denoising ? 1 : 0);
 }
 
-static int update_running_avg(const uint8_t *mc_avg, int mc_avg_stride,
-                              uint8_t *avg, int avg_stride,
-                              const uint8_t *sig, int sig_stride,
-                              int increase_denoising, BLOCK_SIZE bs) {
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 4;
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 25 * 25;
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40);
+}
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+                           int mv_row, int mv_col) {
+  if (mv_row * mv_row + mv_col * mv_col >
+      noise_motion_thresh(bs, increase_denoising)) {
+    return 0;
+  } else {
+    return widths[bs] * heights[bs] * 20;
+  }
+}
+
+static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
+}
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
+}
+
+static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
+                                             const uint8_t *mc_avg,
+                                             int mc_avg_stride,
+                                             uint8_t *avg, int avg_stride,
+                                             int increase_denoising,
+                                             BLOCK_SIZE bs) {
   int r, c;
-  int diff, adj, absdiff;
-  int shift_inc1 = 0, shift_inc2 = 1;
+  const uint8_t *sig_start = sig;
+  const uint8_t *mc_avg_start = mc_avg;
+  uint8_t *avg_start = avg;
+  int diff, adj, absdiff, delta;
   int adj_val[] = {3, 4, 6};
   int total_adj = 0;
 
-  if (increase_denoising) {
-    shift_inc1 = 1;
-    shift_inc2 = 2;
-  }
-
+  // First attempt to apply a strong temporal denoising filter.
   for (r = 0; r < heights[bs]; ++r) {
     for (c = 0; c < widths[bs]; ++c) {
       diff = mc_avg[c] - sig[c];
       absdiff = abs(diff);
 
-      if (absdiff <= 3 + shift_inc1) {
+      if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
         avg[c] = mc_avg[c];
         total_adj += diff;
       } else {
@@ -70,7 +120,47 @@
     avg += avg_stride;
     mc_avg += mc_avg_stride;
   }
-  return total_adj;
+
+  // If the strong filter did not modify the signal too much, we're all set.
+  if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+
+  // Otherwise, we try to dampen the filter if the delta is not too high.
+  delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
+           >> 8) + 1;
+  if (delta > delta_thresh(bs, increase_denoising)) {
+    return COPY_BLOCK;
+  }
+
+  mc_avg =  mc_avg_start;
+  avg = avg_start;
+  sig = sig_start;
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      diff = mc_avg[c] - sig[c];
+      adj = abs(diff);
+      if (adj > delta) {
+        adj = delta;
+      }
+      if (diff > 0) {
+        avg[c] = MAX(0, avg[c] - adj);
+        total_adj += adj;
+      } else {
+        avg[c] = MIN(UINT8_MAX, avg[c] + adj);
+        total_adj -= adj;
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // We can use the filter if it has been sufficiently dampened
+  if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+  return COPY_BLOCK;
 }
 
 static uint8_t *block_start(uint8_t *framebuf, int stride,
@@ -78,33 +168,24 @@
   return framebuf + (stride * mi_row * 8) + (mi_col * 8);
 }
 
-void copy_block(uint8_t *dest, int dest_stride,
-                uint8_t *src, int src_stride, BLOCK_SIZE bs) {
-  int r, c;
+static void copy_block(uint8_t *dest, int dest_stride,
+                       const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
+  int r;
   for (r = 0; r < heights[bs]; ++r) {
-    for (c = 0; c < widths[bs]; ++c) {
-      dest[c] = src[c];
-    }
+    vpx_memcpy(dest, src, widths[bs]);
     dest += dest_stride;
     src += src_stride;
   }
 }
 
-static int perform_motion_compensation(VP9_DENOISER *denoiser, MACROBLOCK *mb,
-                                       BLOCK_SIZE bs, int increase_denoising,
-                                       int mi_row, int mi_col) {
-  // constants
-  // TODO(tkopp): empirically determine good constants, or functions of block
-  // size.
-  int NOISE_MOTION_THRESHOLD = 25 * 25;
-  int SSE_DIFF_THRESHOLD = heights[bs] * widths[bs] * 20;
-  unsigned int SSE_THRESH = heights[bs] * widths[bs] * 40;
-  unsigned int SSE_THRESH_HI = heights[bs] * widths[bs] * 60;
-
+static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
+                                                         MACROBLOCK *mb,
+                                                         BLOCK_SIZE bs,
+                                                         int increase_denoising,
+                                                         int mi_row,
+                                                         int mi_col) {
   int mv_col, mv_row;
   int sse_diff = denoiser->zero_mv_sse - denoiser->best_sse;
-  int sse_diff_thresh;
-  int sse_thresh;
   MV_REFERENCE_FRAME frame;
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
@@ -116,20 +197,15 @@
   saved_pre[0] = filter_mbd->plane[0].pre[0];
   saved_pre[1] = filter_mbd->plane[0].pre[1];
 
-  // Decide the threshold for sum squared error.
   mv_col = denoiser->best_sse_mv.as_mv.col;
   mv_row = denoiser->best_sse_mv.as_mv.row;
-  if (mv_row * mv_row + mv_col * mv_col > NOISE_MOTION_THRESHOLD) {
-    sse_diff_thresh = 0;
-  } else {
-    sse_diff_thresh = SSE_DIFF_THRESHOLD;
-  }
 
   frame = denoiser->best_reference_frame;
 
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
-  if (frame != INTRA_FRAME && sse_diff > sse_diff_thresh) {
+  if (frame != INTRA_FRAME &&
+      sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) {
     mbmi->ref_frame[0] = denoiser->best_reference_frame;
     mbmi->mode = denoiser->best_sse_inter_mode;
     mbmi->mv[0] = denoiser->best_sse_mv;
@@ -212,11 +288,12 @@
 
   mv_row = denoiser->best_sse_mv.as_mv.row;
   mv_col = denoiser->best_sse_mv.as_mv.col;
-  sse_thresh = denoiser->increase_denoising ? SSE_THRESH_HI : SSE_THRESH;
 
-  // TODO(tkopp) why 8?
-  if (denoiser->best_sse > sse_thresh ||
-    mv_row * mv_row + mv_col * mv_col > 8 * NOISE_MOTION_THRESHOLD) {
+  if (denoiser->best_sse > sse_thresh(bs, increase_denoising)) {
+    return COPY_BLOCK;
+  }
+  if (mv_row * mv_row + mv_col * mv_col >
+      8 * noise_motion_thresh(bs, increase_denoising)) {
     return COPY_BLOCK;
   }
   return FILTER_BLOCK;
@@ -224,8 +301,7 @@
 
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs) {
-  int decision = COPY_BLOCK;
-
+  VP9_DENOISER_DECISION decision = FILTER_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
   uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
@@ -236,28 +312,30 @@
   decision = perform_motion_compensation(denoiser, mb, bs,
                                          denoiser->increase_denoising,
                                          mi_row, mi_col);
-  update_running_avg(mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride,
-                     mb->plane[0].src.buf, mb->plane[0].src.stride, 0, bs);
 
   if (decision == FILTER_BLOCK) {
-    // TODO(tkopp)
+    decision = denoiser_filter(src.buf, src.stride,
+                               mc_avg_start, mc_avg.y_stride,
+                               avg_start, avg.y_stride,
+                               0, bs);
   }
-  if (decision == COPY_BLOCK) {
+
+  if (decision == FILTER_BLOCK) {
+    copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+  } else {  // COPY_BLOCK
     copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
   }
 }
 
 static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
-  int r, c;
+  int r;
   const uint8_t *srcbuf = src.y_buffer;
   uint8_t *destbuf = dest.y_buffer;
   assert(dest.y_width == src.y_width);
   assert(dest.y_height == src.y_height);
 
   for (r = 0; r < dest.y_height; ++r) {
-    for (c = 0; c < dest.y_width; ++c) {
-      destbuf[c] = srcbuf[c];
-    }
+    vpx_memcpy(destbuf, srcbuf, dest.y_width);
     destbuf += dest.y_stride;
     srcbuf += src.y_stride;
   }
@@ -325,6 +403,9 @@
       vp9_denoiser_free(denoiser);
       return 1;
     }
+#ifdef OUTPUT_YUV_DENOISED
+    make_grayscale(&denoiser->running_avg_y[i]);
+#endif
   }
 
   fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
@@ -333,7 +414,9 @@
     vp9_denoiser_free(denoiser);
     return 1;
   }
-
+#ifdef OUTPUT_YUV_DENOISED
+  make_grayscale(&denoiser->running_avg_y[i]);
+#endif
   denoiser->increase_denoising = 0;
 
   return 0;
@@ -353,3 +436,22 @@
     vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
   }
 }
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+  int r, c;
+  uint8_t *u = yuv->u_buffer;
+  uint8_t *v = yuv->v_buffer;
+
+  // The '/2's are there because we have a 440 buffer, but we want to output
+  // 420.
+  for (r = 0; r < yuv->uv_height / 2; ++r) {
+    for (c = 0; c < yuv->uv_width / 2; ++c) {
+      u[c] = UINT8_MAX / 2;
+      v[c] = UINT8_MAX / 2;
+    }
+    u += yuv->uv_stride + yuv->uv_width / 2;
+    v += yuv->uv_stride + yuv->uv_width / 2;
+  }
+}
+#endif
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 7855989..cbb6423 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -18,10 +18,10 @@
 extern "C" {
 #endif
 
-enum vp9_denoiser_decision {
+typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
   FILTER_BLOCK
-};
+} VP9_DENOISER_DECISION;
 
 typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 94143d9..dab3ff7 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3074,6 +3074,7 @@
   init_encode_frame_mb_context(cpi);
   set_prev_mi(cm);
 
+  x->quant_fp = cpi->sf.use_quant_fp;
   x->skip_txfm = 0;
   if (sf->use_nonrd_pick_mode) {
     // Initialize internal buffer pointers for rtc coding, where non-RD
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 0961f3b..d97226e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -306,6 +306,56 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;
+  const int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, p->zbin_extra, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -424,11 +474,15 @@
 
   if (x->skip_txfm == 0) {
     // full forward transform and quantization
-    if (!x->skip_recode)
-      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    if (!x->skip_recode) {
+      if (x->quant_fp)
+        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+      else
+        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    }
   } else if (x->skip_txfm == 2) {
     // fast path forward transform and quantization
-    vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+    vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
   } else {
     // skip forward transform
     p->eobs[block] = 0;
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 3196c99..0b8c3d2 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -24,6 +24,8 @@
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 54fb68b..a1007c0 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -65,7 +65,7 @@
 // #define OUTPUT_YUV_REC
 
 #ifdef OUTPUT_YUV_DENOISED
-FILE *yuv_denoised_file;
+FILE *yuv_denoised_file = NULL;
 #endif
 #ifdef OUTPUT_YUV_SRC
 FILE *yuv_file;
@@ -199,6 +199,13 @@
     vpx_free(cpi->source_diff_var);
     cpi->source_diff_var = NULL;
   }
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    vpx_free(cpi->twopass.this_frame_mb_stats.mb_stats);
+    cpi->twopass.this_frame_mb_stats.mb_stats = NULL;
+  }
+#endif
 }
 
 static void save_coding_context(VP9_COMP *cpi) {
@@ -657,9 +664,11 @@
   cpi->ext_refresh_frame_context_pending = 0;
 
 #if CONFIG_DENOISING
-  vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
-                     cm->subsampling_x, cm->subsampling_y,
-                     VP9_ENC_BORDER_IN_PIXELS);
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                       cm->subsampling_x, cm->subsampling_y,
+                       VP9_ENC_BORDER_IN_PIXELS);
+  }
 #endif
 }
 
@@ -766,6 +775,17 @@
                                sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
   }
 
+#if CONFIG_FP_MB_STATS
+  cpi->use_fp_mb_stats = 0;
+  if (cpi->use_fp_mb_stats) {
+    // a placeholder for the mb stats obtained from the first pass
+    CHECK_MEM_ERROR(cm, cpi->twopass.this_frame_mb_stats.mb_stats,
+                    vpx_calloc(cm->MBs * sizeof(FIRSTPASS_MB_STATS), 1));
+  } else {
+    cpi->twopass.this_frame_mb_stats.mb_stats = NULL;
+  }
+#endif
+
   cpi->refresh_alt_ref_frame = 0;
 
   // Note that at the moment multi_arf will not work with svc.
@@ -839,8 +859,12 @@
   cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
+#if CONFIG_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  yuv_denoised_file = fopen("denoised.yuv", "ab");
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    yuv_denoised_file = fopen("denoised.yuv", "ab");
+  }
+#endif
 #endif
 #ifdef OUTPUT_YUV_SRC
   yuv_file = fopen("bd.yuv", "ab");
@@ -1079,7 +1103,9 @@
   }
 
 #if CONFIG_DENOISING
-  vp9_denoiser_free(&(cpi->denoiser));
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp9_denoiser_free(&(cpi->denoiser));
+  }
 #endif
 
   dealloc_compressor_data(cpi);
@@ -1093,8 +1119,12 @@
   vp9_remove_common(&cpi->common);
   vpx_free(cpi);
 
+#if CONFIG_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  fclose(yuv_denoised_file);
+  // cpi was freed above, so test the handle itself (NULL unless opened).
+  if (yuv_denoised_file != NULL)
+    fclose(yuv_denoised_file);
+#endif
 #endif
 #ifdef OUTPUT_YUV_SRC
   fclose(yuv_file);
@@ -1305,6 +1335,7 @@
 }
 #endif
 
+#if CONFIG_DENOISING
 #if defined(OUTPUT_YUV_DENOISED)
 // The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
 // as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
@@ -1336,6 +1367,7 @@
   } while (--h);
 }
 #endif
+#endif
 
 #ifdef OUTPUT_YUV_REC
 void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
@@ -1574,12 +1606,14 @@
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
   }
 #if CONFIG_DENOISING
-  vp9_denoiser_update_frame_info(&cpi->denoiser,
-                                *cpi->Source,
-                                cpi->common.frame_type,
-                                cpi->refresh_alt_ref_frame,
-                                cpi->refresh_golden_frame,
-                                cpi->refresh_last_frame);
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp9_denoiser_update_frame_info(&cpi->denoiser,
+                                   *cpi->Source,
+                                   cpi->common.frame_type,
+                                   cpi->refresh_alt_ref_frame,
+                                   cpi->refresh_golden_frame,
+                                   cpi->refresh_last_frame);
+  }
 #endif
 }
 
@@ -2171,16 +2205,21 @@
   }
 #endif
 
-#ifdef OUTPUT_YUV_DENOISED
-  vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
-                      yuv_denoised_file);
-#endif
 #ifdef OUTPUT_YUV_SRC
   vp9_write_yuv_frame(cpi->Source, yuv_file);
 #endif
 
   set_speed_features(cpi);
 
+#if CONFIG_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
+                            yuv_denoised_file);
+  }
+#endif
+#endif
+
   // Decide q and q bounds.
   q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 5e8430a..b38f9c2 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -348,6 +348,10 @@
   uint64_t time_pick_lpf;
   uint64_t time_encode_sb_row;
 
+#if CONFIG_FP_MB_STATS
+  int use_fp_mb_stats;
+#endif
+
   TWO_PASS twopass;
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 971b159..d505ebf 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -89,23 +89,43 @@
 
 
 // Read frame stats at an offset from the current position.
-static int read_frame_stats(const TWO_PASS *p,
-                            FIRSTPASS_STATS *frame_stats, int offset) {
-  const FIRSTPASS_STATS *fps_ptr = p->stats_in;
-
-  // Check legality of offset.
-  if (offset >= 0) {
-    if (&fps_ptr[offset] >= p->stats_in_end)
-      return EOF;
-  } else if (offset < 0) {
-    if (&fps_ptr[offset] < p->stats_in_start)
-      return EOF;
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+  if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
+      (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+    return NULL;
   }
 
-  *frame_stats = fps_ptr[offset];
+  return &p->stats_in[offset];
+}
+
+#if CONFIG_FP_MB_STATS
+// Loads the current frame's per-MB first pass stats from "firstpass_mb.stt".
+// Returns 1 on success, EOF if the open fails or the read comes up short.
+static int input_mb_stats(FIRSTPASS_FRAME_MB_STATS *fp_frame_stats,
+                          const VP9_COMMON *const cm) {
+  // Binary mode: the file holds raw FIRSTPASS_MB_STATS records.
+  FILE *const fpfile = fopen("firstpass_mb.stt", "rb");
+  int ret;
+  if (fpfile == NULL) return EOF;
+  fseek(fpfile, cm->current_video_frame * cm->MBs * sizeof(FIRSTPASS_MB_STATS),
+        SEEK_SET);
+  ret = fread(fp_frame_stats->mb_stats, sizeof(FIRSTPASS_MB_STATS), cm->MBs,
+              fpfile);
+  fclose(fpfile);
+  if (ret < cm->MBs) return EOF;
   return 1;
 }
 
+// Appends cm->MBs raw per-MB first pass stats records to "firstpass_mb.stt".
+static void output_mb_stats(FIRSTPASS_FRAME_MB_STATS *fp_frame_stats,
+                            const VP9_COMMON *const cm) {
+  FILE *const fpfile = fopen("firstpass_mb.stt", "ab");  // binary append
+  if (fpfile == NULL) return;  // best effort: nothing written on failure
+  fwrite(fp_frame_stats->mb_stats, sizeof(FIRSTPASS_MB_STATS), cm->MBs, fpfile);
+  fclose(fpfile);
+}
+#endif
+
 static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
   if (p->stats_in >= p->stats_in_end)
     return EOF;
@@ -452,6 +472,10 @@
   const MV zero_mv = {0, 0};
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
 
+#if CONFIG_FP_MB_STATS
+  FIRSTPASS_FRAME_MB_STATS *this_frame_mb_stats = &twopass->this_frame_mb_stats;
+#endif
+
   vp9_clear_system_state();
 
   set_first_pass_params(cpi);
@@ -579,6 +603,17 @@
       // Accumulate the intra error.
       intra_error += (int64_t)this_error;
 
+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mode =
+            DC_PRED;
+        this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].err =
+            this_error;
+        this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mv.as_int
+            = 0;
+      }
+#endif
+
       // Set up limit values for motion vectors to prevent them extending
       // outside the UMV borders.
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
@@ -704,6 +739,17 @@
 
           best_ref_mv.as_int = mv.as_int;
 
+#if CONFIG_FP_MB_STATS
+          if (cpi->use_fp_mb_stats) {
+            this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mode =
+                NEWMV;
+            this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].err =
+                motion_error;
+            this_frame_mb_stats->mb_stats[mb_row * cm->mb_cols + mb_col].mv.
+                as_int = mv.as_int;
+          }
+#endif
+
           if (mv.as_int) {
             ++mvcount;
 
@@ -808,6 +854,12 @@
     twopass->this_frame_stats = fps;
     output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
     accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      output_mb_stats(this_frame_mb_stats, cm);
+    }
+#endif
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
@@ -1053,24 +1105,16 @@
 // score in the frame following a flash frame. The offset passed in should
 // reflect this.
 static int detect_flash(const TWO_PASS *twopass, int offset) {
-  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
 
-  int flash_detected = 0;
-
-  // Read the frame data.
-  // The return is FALSE (no flash detected) if not a valid frame
-  if (read_frame_stats(twopass, &next_frame, offset) != EOF) {
-    // What we are looking for here is a situation where there is a
-    // brief break in prediction (such as a flash) but subsequent frames
-    // are reasonably well predicted by an earlier (pre flash) frame.
-    // The recovery after a flash is indicated by a high pcnt_second_ref
-    // compared to pcnt_inter.
-    if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
-        next_frame.pcnt_second_ref >= 0.5)
-      flash_detected = 1;
-  }
-
-  return flash_detected;
+  // What we are looking for here is a situation where there is a
+  // brief break in prediction (such as a flash) but subsequent frames
+  // are reasonably well predicted by an earlier (pre flash) frame.
+  // The recovery after a flash is indicated by a high pcnt_second_ref
+  // compared to pcnt_inter.
+  return next_frame != NULL &&
+         next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+         next_frame->pcnt_second_ref >= 0.5;
 }
 
 // Update the motion related elements to the GF arf boost calculation.
@@ -1130,7 +1174,6 @@
 static int calc_arf_boost(VP9_COMP *cpi, int offset,
                           int f_frames, int b_frames,
                           int *f_boost, int *b_boost) {
-  FIRSTPASS_STATS this_frame;
   TWO_PASS *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
@@ -1144,11 +1187,12 @@
 
   // Search forward from the proposed arf/next gf position.
   for (i = 0; i < f_frames; ++i) {
-    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL)
       break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(&this_frame,
+    accumulate_frame_motion_stats(this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
                                   &mv_ratio_accumulator);
@@ -1160,12 +1204,12 @@
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
-    boost_score += decay_accumulator * calc_frame_boost(twopass, &this_frame,
+    boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame,
                                                         this_frame_mv_in_out);
   }
 
@@ -1181,11 +1225,12 @@
 
   // Search backward towards last gf position.
   for (i = -1; i >= -b_frames; --i) {
-    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL)
       break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(&this_frame,
+    accumulate_frame_motion_stats(this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
                                   &mv_ratio_accumulator);
@@ -1197,12 +1242,12 @@
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
-    boost_score += decay_accumulator * calc_frame_boost(twopass, &this_frame,
+    boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame,
                                                         this_frame_mv_in_out);
   }
   *b_boost = (int)boost_score;
@@ -2167,6 +2212,12 @@
 
   // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    input_mb_stats(&twopass->this_frame_mb_stats, cm);
+  }
+#endif
 }
 
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 1ee56a3..7e4c9ee 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -18,6 +18,18 @@
 extern "C" {
 #endif
 
+#if CONFIG_FP_MB_STATS
+typedef struct {
+  PREDICTION_MODE mode;
+  int err;
+  int_mv mv;
+} FIRSTPASS_MB_STATS;
+
+typedef struct {
+  FIRSTPASS_MB_STATS *mb_stats;
+} FIRSTPASS_FRAME_MB_STATS;
+#endif
+
 typedef struct {
   double frame;
   double intra_error;
@@ -76,6 +88,10 @@
   double kf_intra_err_min;
   double gf_intra_err_min;
 
+#if CONFIG_FP_MB_STATS
+  FIRSTPASS_FRAME_MB_STATS this_frame_mb_stats;
+#endif
+
   // Projected total bits available for a key frame group of frames
   int64_t kf_group_bits;
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index d9edeae..22ad064 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -141,7 +141,7 @@
 
   // TODO(jingning) exploiting adaptive motion search control in non-RD
   // mode decision too.
-  step_param = 6;
+  step_param = cpi->sf.mv.fullpel_search_step_param;
 
   for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
     if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
@@ -440,7 +440,9 @@
   int i;
 
 #if CONFIG_DENOISING
-  vp9_denoiser_reset_frame_stats(&cpi->denoiser);
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp9_denoiser_reset_frame_stats(&cpi->denoiser);
+  }
 #endif
 
   if (cpi->sf.reuse_inter_pred_sby) {
@@ -658,7 +660,9 @@
       }
 
 #if CONFIG_DENOISING
-      vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, this_mode);
+      if (cpi->oxcf.noise_sensitivity > 0) {
+        vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, this_mode);
+      }
 #endif
 
       if (this_rd < best_rd || x->skip) {
@@ -774,7 +778,9 @@
   }
 
 #if CONFIG_DENOISING
-  vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
+  }
 #endif
 
   return INT64_MAX;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index f817bcc..1846da9 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -42,9 +42,9 @@
 }
 
 void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                           const int16_t *round_ptr, const int16_t quant,
+                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
   int eob = -1;
 
   if (!skip_block) {
@@ -63,6 +63,47 @@
   *eob_ptr = eob + 1;
 }
 
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+                       int skip_block,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr,
+                       int zbin_oq_value, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+
+  if (!skip_block) {
+    // Quantization pass: every coefficient is quantized in scan order with
+    // no zero-bin test; eob tracks the last nonzero quantized value.
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -207,11 +248,16 @@
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
+      int qrounding_factor_fp = i == 0 ? 48 : 42;
+      if (q == 0)
+        qrounding_factor_fp = 64;
+
       // y
       quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
                      : vp9_ac_quant(q, 0);
       invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
       quants->y_quant_fp[q][i] = (1 << 16) / quant;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->y_dequant[q][i] = quant;
@@ -222,6 +268,7 @@
       invert_quant(&quants->uv_quant[q][i],
                    &quants->uv_quant_shift[q][i], quant);
       quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
       quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->uv_dequant[q][i] = quant;
@@ -240,6 +287,7 @@
     for (i = 2; i < 8; i++) {
       quants->y_quant[q][i] = quants->y_quant[q][1];
       quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
       quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
       quants->y_zbin[q][i] = quants->y_zbin[q][1];
       quants->y_round[q][i] = quants->y_round[q][1];
@@ -247,6 +295,7 @@
 
       quants->uv_quant[q][i] = quants->uv_quant[q][1];
       quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
       quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
       quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
       quants->uv_round[q][i] = quants->uv_round[q][1];
@@ -276,6 +325,7 @@
   // Y
   x->plane[0].quant = quants->y_quant[qindex];
   x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
@@ -286,6 +336,7 @@
   for (i = 1; i < 3; i++) {
     x->plane[i].quant = quants->uv_quant[qindex];
     x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+    x->plane[i].round_fp = quants->uv_round_fp[qindex];
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 0e90462..24e4491 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -28,6 +28,8 @@
   // if we want to deprecate the current use of y_quant.
   DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
 
   DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e1a03a6..a4cdd33 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -4289,6 +4289,10 @@
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
+  // Adjust threshold only in real time mode, which only uses the last
+  // reference frame.
+  rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
+
   rd->thresh_mult[THR_NEARMV] += 1000;
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
@@ -4351,10 +4355,6 @@
     rd->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
     rd->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
   }
-
-  // Adjust threshold only in real time mode, which only use last reference
-  // frame.
-  rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
 }
 
 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 574df62..897ae01 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -27,6 +27,8 @@
 
 void vp9_disable_segmentation(struct segmentation *seg) {
   seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
 }
 
 void vp9_set_segment_data(struct segmentation *seg,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 8111870..1eac02f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -282,6 +282,8 @@
     sf->elevate_newmv_thresh = 2000;
   }
   if (speed >= 7) {
+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
+    sf->mv.fullpel_search_step_param = 10;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
         800 : 300;
@@ -311,6 +313,7 @@
   sf->mv.reduce_first_step_size = 0;
   sf->mv.auto_mv_step_size = 0;
   sf->mv.max_step_search_steps = MAX_MVSEARCH_STEPS;
+  sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   sf->adaptive_rd_thresh = 0;
   sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
@@ -318,6 +321,7 @@
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
+  sf->use_quant_fp = 0;
   sf->reference_masking = 0;
   sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
@@ -357,7 +361,6 @@
   sf->search_type_check_frequency = 50;
   sf->encode_breakout_thresh = 0;
   sf->elevate_newmv_thresh = 0;
-
   // Recode loop tolerence %.
   sf->recode_tolerance = 25;
 
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index f6d6311..4ccb77a 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -162,6 +162,9 @@
 
   // Control when to stop subpel search
   int subpel_force_stop;
+
+  // This variable sets the step_param used in full pel motion search.
+  int fullpel_search_step_param;
 } MV_SPEED_FEATURES;
 
 typedef struct SPEED_FEATURES {
@@ -284,6 +287,9 @@
   // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
   int adaptive_pred_interp_filter;
 
+  // Fast quantization process path
+  int use_quant_fp;
+
   // Search through variable block partition types in non-RD mode decision
   // encoding process for RTC.
   int partition_check;
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 48ccef8..62da865 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -217,3 +217,185 @@
 INIT_XMM ssse3
 QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+; TODO(jingning) to be continued with 32x32 quantization process
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  mov                             r3, qcoeffmp
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpeqw                         m7, m7
+  pcmpeqw                        m12, m12
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpeqw                         m7, m7
+  pcmpeqw                        m12, m12
+%ifidn %1, b_32x32
+  pmovmskb                        r6, m7
+  pmovmskb                        r2, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index d60883c..b90c37b 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -31,6 +31,7 @@
     img->fmt = VPX_IMG_FMT_I420;
     bps = 12;
   }
+  img->bit_depth = 8;
   img->w = yv12->y_stride;
   img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
   img->d_w = yv12->y_crop_width;
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index 36eda95..dc8fcbc 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -40,13 +40,13 @@
   }
 }
 
-static vpx_image_t *img_alloc_helper(vpx_image_t  *img,
-                                     vpx_img_fmt_t fmt,
-                                     unsigned int  d_w,
-                                     unsigned int  d_h,
-                                     unsigned int  buf_align,
-                                     unsigned int  stride_align,
-                                     unsigned char      *img_data) {
+static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
+                                     vpx_img_fmt_t  fmt,
+                                     unsigned int   d_w,
+                                     unsigned int   d_h,
+                                     unsigned int   buf_align,
+                                     unsigned int   stride_align,
+                                     unsigned char *img_data) {
 
   unsigned int  h, w, s, xcs, ycs, bps;
   int           align;
@@ -94,6 +94,21 @@
     case VPX_IMG_FMT_VPXYV12:
       bps = 12;
       break;
+    case VPX_IMG_FMT_I422:
+      bps = 16;
+      break;
+    case VPX_IMG_FMT_I444:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_I42016:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_I42216:
+      bps = 32;
+      break;
+    case VPX_IMG_FMT_I44416:
+      bps = 48;
+      break;
     default:
       bps = 16;
       break;
@@ -105,6 +120,9 @@
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_VPXI420:
     case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
       xcs = 1;
       break;
     default:
@@ -156,6 +174,7 @@
     goto fail;
 
   img->fmt = fmt;
+  img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
   img->w = w;
   img->h = h;
   img->x_chroma_shift = xcs;
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 03d2dec..45e7023 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -212,6 +212,15 @@
     vpx_codec_priv_t        *priv;        /**< Algorithm private storage */
   } vpx_codec_ctx_t;
 
+  /*!\brief Bit depth for codec
+   *
+   * This enumeration determines the bit depth of the codec.
+   */
+  typedef enum vpx_bit_depth {
+    VPX_BITS_8,   /**< 8 bits  */
+    VPX_BITS_10,  /**< 10 bits */
+    VPX_BITS_12   /**< 12 bits */
+  } vpx_bit_depth_t;
 
   /*
    * Library Version Number Interface
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index d45b003..7b04b70 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -103,8 +103,9 @@
     vpx_img_fmt_t fmt; /**< Image Format */
 
     /* Image storage dimensions */
-    unsigned int  w;   /**< Stored image width */
-    unsigned int  h;   /**< Stored image height */
+    unsigned int  w;           /**< Stored image width */
+    unsigned int  h;           /**< Stored image height */
+    unsigned int  bit_depth;   /**< Stored image bit-depth */
 
     /* Image display dimensions */
     unsigned int  d_w;   /**< Displayed image width */
diff --git a/vpxdec.c b/vpxdec.c
index a3a1da5..1213ab6 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -900,7 +900,8 @@
             len = y4m_write_file_header(buf, sizeof(buf),
                                         vpx_input_ctx.width,
                                         vpx_input_ctx.height,
-                                        &vpx_input_ctx.framerate, img->fmt);
+                                        &vpx_input_ctx.framerate,
+                                        img->fmt, 8);
             if (do_md5) {
               MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
             } else {
diff --git a/vpxenc.c b/vpxenc.c
index d46a83e..fce6807 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -756,6 +756,7 @@
       input->framerate.numerator = input->y4m.fps_n;
       input->framerate.denominator = input->y4m.fps_d;
       input->fmt = input->y4m.vpx_fmt;
+      input->bit_depth = input->y4m.bit_depth;
     } else
       fatal("Unsupported Y4M stream.");
   } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
@@ -1533,6 +1534,7 @@
   input.framerate.numerator = 30;
   input.framerate.denominator = 1;
   input.only_i420 = 1;
+  input.bit_depth = 0;
 
   /* First parse the global configuration values, because we want to apply
    * other parameters on top of the default configuration provided by the
diff --git a/y4menc.c b/y4menc.c
index 8b1c95e..9211452 100644
--- a/y4menc.c
+++ b/y4menc.c
@@ -8,16 +8,58 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include "./y4menc.h"
 
+/* Formats the YUV4MPEG2 stream header for the given geometry, frame rate,
+ * image format and bit depth into buf (at most len bytes, via snprintf) and
+ * returns snprintf's result. The colourspace tag is chosen from fmt within the
+ * table for bit_depth; handled depths are 8, 9, 10, 12, 14 and 16. Any other
+ * depth trips the assert; when asserts are compiled out an empty string is
+ * written and 0 is returned instead of formatting an unset pointer. */
 int y4m_write_file_header(char *buf, size_t len, int width, int height,
                           const struct VpxRational *framerate,
-                          vpx_img_fmt_t fmt) {
-  const char *const color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
-                            fmt == VPX_IMG_FMT_I444 ? "C444\n" :
-                            fmt == VPX_IMG_FMT_I422 ? "C422\n" :
-                            "C420jpeg\n";
-
+                          vpx_img_fmt_t fmt, unsigned int bit_depth) {
+  const char *color;
+  switch (bit_depth) {
+    case 8:
+      color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
+              fmt == VPX_IMG_FMT_I444 ? "C444\n" :
+              fmt == VPX_IMG_FMT_I422 ? "C422\n" :
+              "C420jpeg\n";
+      break;
+    case 9:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" :
+              "C420p9 XYSCSS=420P9\n";
+      break;
+    case 10:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" :
+              "C420p10 XYSCSS=420P10\n";
+      break;
+    case 12:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" :
+              "C420p12 XYSCSS=420P12\n";
+      break;
+    case 14:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" :
+              "C420p14 XYSCSS=420P14\n";
+      break;
+    case 16:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" :
+              "C420p16 XYSCSS=420P16\n";
+      break;
+    default:
+      /* Unsupported depth: fail loudly in debug builds; with NDEBUG, emit an
+       * empty header rather than reading the unset color pointer. */
+      assert(0);
+      if (len > 0) buf[0] = '\0';
+      return 0;
+  }
   return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height,
                   framerate->numerator, framerate->denominator, 'p', color);
 }
diff --git a/y4menc.h b/y4menc.h
index 0fabf56..69d5904 100644
--- a/y4menc.h
+++ b/y4menc.h
@@ -23,7 +23,7 @@
 
 int y4m_write_file_header(char *buf, size_t len, int width, int height,
                           const struct VpxRational *framerate,
-                          vpx_img_fmt_t fmt);
+                          vpx_img_fmt_t fmt, unsigned int bit_depth);
 int y4m_write_frame_header(char *buf, size_t len);
 
 #ifdef __cplusplus
diff --git a/y4minput.c b/y4minput.c
index 90c5310..b005b71 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -737,15 +737,52 @@
     return -1;
   }
   _y4m->vpx_fmt = VPX_IMG_FMT_I420;
-  _y4m->vpx_bps = 12;
+  _y4m->bps = 12;
+  _y4m->bit_depth = 8;
   if (strcmp(_y4m->chroma_type, "420") == 0 ||
       strcmp(_y4m->chroma_type, "420jpeg") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
     _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
                             + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    /*Natively supported: no conversion required.*/
+    /* Natively supported: no conversion required. */
     _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
     _y4m->convert = y4m_convert_null;
+  } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_v = 2;
+    _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) *
+                                 ((_y4m->pic_h + 1) / 2));
+    /* Natively supported: no conversion required. */
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    _y4m->bit_depth = 10;
+    _y4m->bps = 15;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_v = 2;
+    _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) *
+                                 ((_y4m->pic_h + 1) / 2));
+    /* Natively supported: no conversion required. */
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    _y4m->bit_depth = 12;
+    _y4m->bps = 18;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
+      return -1;
+    }
   } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
     _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
@@ -786,7 +823,7 @@
       _y4m->convert = y4m_convert_422_420jpeg;
     } else {
       _y4m->vpx_fmt = VPX_IMG_FMT_I422;
-      _y4m->vpx_bps = 16;
+      _y4m->bps = 16;
       _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
       _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
       _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
@@ -794,7 +831,39 @@
       /*Natively supported: no conversion required.*/
       _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
       _y4m->convert = y4m_convert_null;
-      }
+    }
+  } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
+    _y4m->bps = 20;
+    _y4m->bit_depth = 10;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
+    _y4m->bps = 24;
+    _y4m->bit_depth = 12;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
+      return -1;
+    }
   } else if (strcmp(_y4m->chroma_type, "411") == 0) {
     _y4m->src_c_dec_h = 4;
     _y4m->dst_c_dec_h = 2;
@@ -823,7 +892,7 @@
       _y4m->convert = y4m_convert_444_420jpeg;
     } else {
       _y4m->vpx_fmt = VPX_IMG_FMT_I444;
-      _y4m->vpx_bps = 24;
+      _y4m->bps = 24;
       _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
       _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
       _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
@@ -831,6 +900,36 @@
       _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
       _y4m->convert = y4m_convert_null;
     }
+  } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
+    _y4m->src_c_dec_h = 1;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
+    _y4m->bps = 30;
+    _y4m->bit_depth = 10;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
+    _y4m->src_c_dec_h = 1;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
+    _y4m->bps = 36;
+    _y4m->bit_depth = 12;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
+      return -1;
+    }
   } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
     _y4m->src_c_dec_h = 1;
     _y4m->src_c_dec_v = 1;
@@ -847,7 +946,7 @@
       _y4m->convert = y4m_convert_444_420jpeg;
     } else {
       _y4m->vpx_fmt = VPX_IMG_FMT_444A;
-      _y4m->vpx_bps = 32;
+      _y4m->bps = 32;
       _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
       _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
       _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
@@ -871,7 +970,10 @@
   _y4m->dst_buf_sz = _y4m->pic_w * _y4m->pic_h
                      + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
                      ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
-  _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  if (_y4m->bit_depth == 8)
+    _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  else
+    _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
   _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
   return 0;
 }
@@ -887,6 +989,7 @@
   int  c_w;
   int  c_h;
   int  c_sz;
+  int  bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1;
   /*Read and skip the frame header.*/
   if (!file_read(frame, 6, _fin)) return 0;
   if (memcmp(frame, "FRAME", 5)) {
@@ -924,14 +1027,16 @@
   _img->h = _img->d_h = _y4m->pic_h;
   _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
   _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
-  _img->bps = _y4m->vpx_bps;
+  _img->bps = _y4m->bps;
 
   /*Set up the buffer pointers.*/
-  pic_sz = _y4m->pic_w * _y4m->pic_h;
+  pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample;
   c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  c_w *= bytes_per_sample;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] = _y4m->pic_w;
+  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] =
+      _y4m->pic_w * bytes_per_sample;
   _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
   _img->planes[PLANE_Y] = _y4m->dst_buf;
   _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
diff --git a/y4minput.h b/y4minput.h
index d53eb65..356cebb 100644
--- a/y4minput.h
+++ b/y4minput.h
@@ -58,7 +58,8 @@
   unsigned char    *dst_buf;
   unsigned char    *aux_buf;
   enum vpx_img_fmt  vpx_fmt;
-  int               vpx_bps;
+  int               bps;
+  unsigned int      bit_depth;
 };
 
 int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,