Merge "gen_msvs_*proj.sh: speed up file generation"
diff --git a/README b/README
index 6d7d5ec..d3cb302 100644
--- a/README
+++ b/README
@@ -1,5 +1,4 @@
-vpx Multi-Format Codec SDK
-README - 1 August 2013
+README - 30 May 2014
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -131,6 +130,14 @@
   This defaults to config.log. This should give a good indication of what went
   wrong. If not, contact us for support.
 
+VP8/VP9 TEST VECTORS:
+  The test vectors can be downloaded and verified using the build system after
+  running configure. To specify an alternate directory the
+  LIBVPX_TEST_DATA_PATH environment variable can be used.
+
+  $ ./configure --enable-unit-tests
+  $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata
+
 SUPPORT
   This library is an open source project supported by its community. Please
   please email webm-discuss@webmproject.org for help.
diff --git a/build/arm-msvs/obj_int_extract.bat b/build/arm-msvs/obj_int_extract.bat
index 267ed61..c0987bc 100644
--- a/build/arm-msvs/obj_int_extract.bat
+++ b/build/arm-msvs/obj_int_extract.bat
@@ -11,8 +11,8 @@
 REM   %1 - Relative path to the directory containing the vp8 and vpx_scale
 REM        source directories.
 REM   %2 - Path to obj_int_extract.exe.
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
 %2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
 
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"
+cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vpx_scale/vpx_scale_asm_offsets.c"
 %2\obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index ad7dc82..ced8330 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -774,6 +774,13 @@
             add_cflags  "-mmacosx-version-min=10.9"
             add_ldflags "-mmacosx-version-min=10.9"
             ;;
+        *-iphonesimulator-*)
+            add_cflags  "-miphoneos-version-min=5.0"
+            add_ldflags "-miphoneos-version-min=5.0"
+            osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
+            add_cflags  "-isysroot ${osx_sdk_dir}"
+            add_ldflags "-isysroot ${osx_sdk_dir}"
+            ;;
     esac
 
     # Handle Solaris variants. Solaris 10 needs -lposix4
@@ -1164,6 +1171,12 @@
                 # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
                 enabled icc && ! enabled pic && add_cflags -fno-pic
             ;;
+            iphonesimulator)
+                add_asflags -f macho32
+                sim_arch="-arch i386"
+                add_cflags  ${sim_arch}
+                add_ldflags ${sim_arch}
+           ;;
             os2)
                 add_asflags -f aout
                 enabled debug && add_asflags -g
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index d5c57d8..f1cc04e 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -67,7 +67,9 @@
             if [ "${f##*.}" == "$pat" ]; then
                 unset file_list[i]
 
-                objf=$(echo ${f%.*}.obj | sed -e 's/^[\./]\+//g' -e 's,[:/],_,g')
+                objf=$(echo ${f%.*}.obj \
+                       | sed -e "s,$src_path_bare,," \
+                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
                 open_tag File RelativePath="$f"
 
                 if [ "$pat" == "asm" ] && $asm_use_custom_step; then
@@ -153,7 +155,7 @@
             opt=${opt##-I}
             opt=$(fix_path "$opt")
             incs="${incs}${incs:+;}"${opt}""
-            yasmincs="${yasmincs} -I${opt}"
+            yasmincs="${yasmincs} -I"${opt}""
         ;;
         -D*) defines="${defines}${defines:+;}${opt##-D}"
         ;;
@@ -305,7 +307,7 @@
                     vpx)
                         tag Tool \
                             Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
+                            CommandLine="call obj_int_extract.bat "$src_path_bare" $plat_no_ws\\\$(ConfigurationName)" \
 
                         tag Tool \
                             Name="VCCLCompilerTool" \
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 6259640..eee354d 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -78,7 +78,9 @@
             if [ "${f##*.}" == "$pat" ]; then
                 unset file_list[i]
 
-                objf=$(echo ${f%.*}.obj | sed -e 's/^[\./]\+//g' -e 's,[:/],_,g')
+                objf=$(echo ${f%.*}.obj \
+                       | sed -e "s,$src_path_bare,," \
+                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
 
                 if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
                     # Avoid object file name collisions, i.e. vpx_config.c and
@@ -175,7 +177,7 @@
             opt=${opt##-I}
             opt=$(fix_path "$opt")
             incs="${incs}${incs:+;}"${opt}""
-            yasmincs="${yasmincs} -I${opt}"
+            yasmincs="${yasmincs} -I"${opt}""
         ;;
         -D*) defines="${defines}${defines:+;}${opt##-D}"
         ;;
@@ -397,7 +399,7 @@
                     hostplat=Win32
                 fi
                 open_tag PreBuildEvent
-                tag_content Command "call obj_int_extract.bat $src_path_bare $hostplat\\\$(Configuration)"
+                tag_content Command "call obj_int_extract.bat "$src_path_bare" $hostplat\\\$(Configuration)"
                 close_tag PreBuildEvent
             fi
             open_tag ClCompile
diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
index 44d095d..dfa3b90 100644
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -10,6 +10,6 @@
 REM Arguments:
 REM   %1 - Relative path to the directory containing the vp8 source directory.
 REM   %2 - Path to obj_int_extract.exe.
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+cl /I. /I%1 /nologo /c "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
 %2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
 
diff --git a/configure b/configure
index b6d645a..5fd5a90 100755
--- a/configure
+++ b/configure
@@ -120,6 +120,7 @@
 all_platforms="${all_platforms} x86-darwin11-gcc"
 all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
 all_platforms="${all_platforms} x86-os2-gcc"
@@ -723,6 +724,10 @@
             # iOS/ARM builds do not work with gtest. This does not match
             # x86 targets.
         ;;
+        *-iphonesimulator-*)
+            soft_enable webm_io
+            soft_enable libyuv
+        ;;
         *-win*)
             # Some mingw toolchains don't have pthread available by default.
             # Treat these more like visual studio where threading in gtest
diff --git a/examples.mk b/examples.mk
index 946c030..ce833fc 100644
--- a/examples.mk
+++ b/examples.mk
@@ -306,6 +306,7 @@
             --name=$$(@:.$(VCPROJ_SFX)=)\
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
+            --src-path-bare="$(SRC_PATH_BARE)" \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
diff --git a/libs.mk b/libs.mk
index d02e9bc..2b072b6 100644
--- a/libs.mk
+++ b/libs.mk
@@ -222,6 +222,7 @@
     --name=obj_int_extract \
     --ver=$(CONFIG_VS_VERSION) \
     --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
+    --src-path-bare="$(SRC_PATH_BARE)" \
     $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
     --out=$@ $^ \
     -I. \
@@ -253,6 +254,7 @@
             --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
             --module-def=vpx.def \
             --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
             --out=$@ $(CFLAGS) \
             $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
             --src-path-bare="$(SRC_PATH_BARE)" \
@@ -447,6 +449,7 @@
             -D_VARIADIC_MAX=10 \
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 92976e1..99c8d0c 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -366,12 +366,13 @@
         input_block[j] = rnd.Rand8() - rnd.Rand8();
         input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
       }
-      if (i == 0)
+      if (i == 0) {
         for (int j = 0; j < kNumCoeffs; ++j)
           input_extreme_block[j] = 255;
-      if (i == 1)
+      } else if (i == 1) {
         for (int j = 0; j < kNumCoeffs; ++j)
           input_extreme_block[j] = -255;
+      }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
       REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 72c0bd6..501c696 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -179,12 +179,13 @@
       input_block[j] = rnd.Rand8() - rnd.Rand8();
       input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
     }
-    if (i == 0)
+    if (i == 0) {
       for (int j = 0; j < kNumCoeffs; ++j)
         input_extreme_block[j] = 255;
-    if (i == 1)
+    } else if (i == 1) {
       for (int j = 0; j < kNumCoeffs; ++j)
         input_extreme_block[j] = -255;
+    }
 
     const int stride = 32;
     vp9_fdct32x32_c(input_extreme_block, output_ref_block, stride);
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 030665e..961eb4d 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -136,12 +136,13 @@
         input_block[j] = rnd.Rand8() - rnd.Rand8();
         input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
       }
-      if (i == 0)
+      if (i == 0) {
         for (int j = 0; j < kNumCoeffs; ++j)
           input_extreme_block[j] = 255;
-      if (i == 1)
+      } else if (i == 1) {
         for (int j = 0; j < kNumCoeffs; ++j)
           input_extreme_block[j] = -255;
+      }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
       REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index c7cf164..72a5fad 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -177,23 +177,36 @@
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     int max_error = 0;
     int total_error = 0;
+    int total_coeff_error = 0;
     const int count_test_block = 100000;
     DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
     DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
     DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
 
     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
       for (int j = 0; j < 64; ++j) {
-        src[j] = rnd.Rand8() % 2 ? 255 : 0;
-        dst[j] = src[j] > 0 ? 0 : 255;
+        if (i == 0) {
+          src[j] = 255;
+          dst[j] = 0;
+        } else if (i == 1) {
+          src[j] = 0;
+          dst[j] = 255;
+        } else {
+          src[j] = rnd.Rand8() % 2 ? 255 : 0;
+          dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+        }
+
         test_input_block[j] = src[j] - dst[j];
       }
 
       REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       REGISTER_STATE_CHECK(
+          fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
+      REGISTER_STATE_CHECK(
           RunInvTxfm(test_temp_block, dst, pitch_));
 
       for (int j = 0; j < 64; ++j) {
@@ -202,6 +215,9 @@
         if (max_error < error)
           max_error = error;
         total_error += error;
+
+        const int coeff_diff = test_temp_block[j] - ref_temp_block[j];
+        total_coeff_error += abs(coeff_diff);
       }
 
       EXPECT_GE(1, max_error)
@@ -211,6 +227,10 @@
       EXPECT_GE(count_test_block/5, total_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
           << " roundtrip error > 1/5 per block";
+
+      EXPECT_EQ(0, total_coeff_error)
+          << "Error: Extremal 8x8 FDCT/FHT has"
+          << "overflow issues in the intermediate steps > 1";
     }
   }
 
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 79ef521..f2171b2 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -28,7 +28,8 @@
 namespace {
 typedef void (*fwd_txfm_t)(const int16_t *in, int16_t *out, int stride);
 typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *out, int stride);
-typedef std::tr1::tuple<inv_txfm_t,
+typedef std::tr1::tuple<fwd_txfm_t,
+                        inv_txfm_t,
                         inv_txfm_t,
                         TX_SIZE, int> partial_itxfm_param_t;
 const int kMaxNumCoeffs = 1024;
@@ -36,10 +37,11 @@
  public:
   virtual ~PartialIDctTest() {}
   virtual void SetUp() {
-    full_itxfm_ = GET_PARAM(0);
-    partial_itxfm_ = GET_PARAM(1);
-    tx_size_  = GET_PARAM(2);
-    last_nonzero_ = GET_PARAM(3);
+    ftxfm_ = GET_PARAM(0);
+    full_itxfm_ = GET_PARAM(1);
+    partial_itxfm_ = GET_PARAM(2);
+    tx_size_  = GET_PARAM(3);
+    last_nonzero_ = GET_PARAM(4);
   }
 
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
@@ -47,10 +49,90 @@
  protected:
   int last_nonzero_;
   TX_SIZE tx_size_;
+  fwd_txfm_t ftxfm_;
   inv_txfm_t full_itxfm_;
   inv_txfm_t partial_itxfm_;
 };
 
+TEST_P(PartialIDctTest, RunQuantCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int size;
+  switch (tx_size_) {
+    case TX_4X4:
+      size = 4;
+      break;
+    case TX_8X8:
+      size = 8;
+      break;
+    case TX_16X16:
+      size = 16;
+      break;
+    case TX_32X32:
+      size = 32;
+      break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
+
+  const int count_test_block = 1000;
+  const int block_size = size * size;
+
+  DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);
+
+  int max_error = 0;
+  for (int i = 0; i < count_test_block; ++i) {
+    // clear out destination buffer
+    memset(dst1, 0, sizeof(*dst1) * block_size);
+    memset(dst2, 0, sizeof(*dst2) * block_size);
+    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      if (i == 0) {
+        for (int j = 0; j < block_size; ++j)
+          input_extreme_block[j] = 255;
+      } else if (i == 1) {
+        for (int j = 0; j < block_size; ++j)
+          input_extreme_block[j] = -255;
+      } else {
+        for (int j = 0; j < block_size; ++j) {
+          input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        }
+      }
+
+      ftxfm_(input_extreme_block, output_ref_block, size);
+
+      // quantization with maximum allowed step sizes
+      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+      for (int j = 1; j < last_nonzero_; ++j)
+        test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]]
+                         = (output_ref_block[j] / 1828) * 1828;
+    }
+
+    REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+
+    for (int j = 0; j < block_size; ++j) {
+      const int diff = dst1[j] - dst2[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+    }
+  }
+
+  EXPECT_EQ(0, max_error)
+      << "Error: partial inverse transform produces different results";
+}
+
 TEST_P(PartialIDctTest, ResultsMatch) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int size;
@@ -119,47 +201,60 @@
 INSTANTIATE_TEST_CASE_P(
     C, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_idct32x32_1024_add_c,
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
                    &vp9_idct32x32_34_add_c,
                    TX_32X32, 34),
-        make_tuple(&vp9_idct32x32_1024_add_c,
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
                    &vp9_idct32x32_1_add_c,
                    TX_32X32, 1),
-        make_tuple(&vp9_idct16x16_256_add_c,
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
                    &vp9_idct16x16_10_add_c,
                    TX_16X16, 10),
-        make_tuple(&vp9_idct16x16_256_add_c,
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
                    &vp9_idct16x16_1_add_c,
                    TX_16X16, 1),
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_12_add_c,
                    TX_8X8, 12),
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_c,
                    TX_8X8, 1),
-        make_tuple(&vp9_idct4x4_16_add_c,
+        make_tuple(&vp9_fdct4x4_c,
+                   &vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_c,
                    TX_4X4, 1)));
 #if HAVE_NEON_ASM
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_idct32x32_1024_add_c,
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
                    &vp9_idct32x32_1_add_neon,
                    TX_32X32, 1),
-        make_tuple(&vp9_idct16x16_256_add_c,
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
                    &vp9_idct16x16_10_add_neon,
                    TX_16X16, 10),
-        make_tuple(&vp9_idct16x16_256_add_c,
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
                    &vp9_idct16x16_1_add_neon,
                    TX_16X16, 1),
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_12_add_neon,
                    TX_8X8, 12),
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_neon,
                    TX_8X8, 1),
-        make_tuple(&vp9_idct4x4_16_add_c,
+        make_tuple(&vp9_fdct4x4_c,
+                   &vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_neon,
                    TX_4X4, 1)));
 #endif
@@ -168,35 +263,53 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_idct32x32_1024_add_c,
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
                    &vp9_idct32x32_34_add_sse2,
                    TX_32X32, 34),
-        make_tuple(&vp9_idct32x32_1024_add_c,
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c,
                    &vp9_idct32x32_1_add_sse2,
                    TX_32X32, 1),
-        make_tuple(&vp9_idct16x16_256_add_c,
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
                    &vp9_idct16x16_10_add_sse2,
                    TX_16X16, 10),
-        make_tuple(&vp9_idct16x16_256_add_c,
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
                    &vp9_idct16x16_1_add_sse2,
                    TX_16X16, 1),
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_12_add_sse2,
                    TX_8X8, 12),
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_1_add_sse2,
                    TX_8X8, 1),
-        make_tuple(&vp9_idct4x4_16_add_c,
+        make_tuple(&vp9_fdct4x4_c,
+                   &vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_sse2,
                    TX_4X4, 1)));
 #endif
 
 #if HAVE_SSSE3 && ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, PartialIDctTest,
+    SSSE3_64, PartialIDctTest,
     ::testing::Values(
-        make_tuple(&vp9_idct8x8_64_add_c,
+        make_tuple(&vp9_fdct8x8_c,
+                   &vp9_idct8x8_64_add_c,
                    &vp9_idct8x8_12_add_ssse3,
                    TX_8X8, 12)));
 #endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_c,
+                   &vp9_idct16x16_10_add_ssse3,
+                   TX_16X16, 10)));
+#endif
 }  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
index a692891..f9ffa92 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -29,12 +29,22 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 
+#if CONFIG_VP8_ENCODER
 typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
                                         int source_stride,
                                         const unsigned char *reference_ptr,
                                         int reference_stride,
                                         unsigned int max_sad);
 typedef std::tr1::tuple<int, int, sad_m_by_n_fn_t> sad_m_by_n_test_param_t;
+#endif
+#if CONFIG_VP9_ENCODER
+typedef unsigned int (*sad_m_by_n_fn_vp9_t)(const unsigned char *source_ptr,
+                                            int source_stride,
+                                            const unsigned char *reference_ptr,
+                                            int reference_stride);
+typedef std::tr1::tuple<int, int, sad_m_by_n_fn_vp9_t>
+                  sad_m_by_n_test_param_vp9_t;
+#endif
 
 typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr,
                                      int src_stride,
@@ -87,7 +97,7 @@
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
-  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx = 0) {
+  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
     unsigned int sad = 0;
     const uint8_t* const reference = GetReference(block_idx);
 
@@ -128,39 +138,9 @@
   ACMRandom rnd_;
 };
 
-class SADTest : public SADTestBase,
-    public ::testing::WithParamInterface<sad_m_by_n_test_param_t> {
- public:
-  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
-
- protected:
-  unsigned int SAD(unsigned int max_sad, int block_idx = 0) {
-    unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
-
-    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
-                                            reference, reference_stride_,
-                                            max_sad));
-    return ret;
-  }
-
-  void CheckSad(unsigned int max_sad) {
-    unsigned int reference_sad, exp_sad;
-
-    reference_sad = ReferenceSAD(max_sad);
-    exp_sad = SAD(max_sad);
-
-    if (reference_sad <= max_sad) {
-      ASSERT_EQ(exp_sad, reference_sad);
-    } else {
-      // Alternative implementations are not required to check max_sad
-      ASSERT_GE(exp_sad, reference_sad);
-    }
-  }
-};
-
-class SADx4Test : public SADTestBase,
-    public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> {
+class SADx4Test
+    : public SADTestBase,
+      public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> {
  public:
   SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 
@@ -178,23 +158,169 @@
     unsigned int reference_sad, exp_sad[4];
 
     SADs(exp_sad);
-    for (int block = 0; block < 4; block++) {
+    for (int block = 0; block < 4; ++block) {
       reference_sad = ReferenceSAD(UINT_MAX, block);
 
-      EXPECT_EQ(exp_sad[block], reference_sad) << "block " << block;
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
 };
 
+#if CONFIG_VP8_ENCODER
+class SADTest
+    : public SADTestBase,
+      public ::testing::WithParamInterface<sad_m_by_n_test_param_t> {
+ public:
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  unsigned int SAD(unsigned int max_sad, int block_idx) {
+    unsigned int ret;
+    const uint8_t* const reference = GetReference(block_idx);
+
+    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                            reference, reference_stride_,
+                                            max_sad));
+    return ret;
+  }
+
+  void CheckSAD(unsigned int max_sad) {
+    const unsigned int reference_sad = ReferenceSAD(max_sad, 0);
+    const unsigned int exp_sad = SAD(max_sad, 0);
+
+    if (reference_sad <= max_sad) {
+      ASSERT_EQ(exp_sad, reference_sad);
+    } else {
+      // Alternative implementations are not required to check max_sad
+      ASSERT_GE(exp_sad, reference_sad);
+    }
+  }
+};
+#endif  // CONFIG_VP8_ENCODER
+
+#if CONFIG_VP9_ENCODER
+class SADVP9Test
+    : public SADTestBase,
+      public ::testing::WithParamInterface<sad_m_by_n_test_param_vp9_t> {
+ public:
+  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  unsigned int SAD(int block_idx) {
+    unsigned int ret;
+    const uint8_t* const reference = GetReference(block_idx);
+
+    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                            reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSAD(UINT_MAX, 0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
 uint8_t* SADTestBase::source_data_ = NULL;
 uint8_t* SADTestBase::reference_data_ = NULL;
 
+#if CONFIG_VP8_ENCODER
 TEST_P(SADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, 255);
-  CheckSad(UINT_MAX);
+  CheckSAD(UINT_MAX);
 }
 
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD(UINT_MAX);
+}
+
+TEST_P(SADTest, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD(UINT_MAX);
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, MaxSAD) {
+  // Verify that, when max_sad is set, the implementation does not return a
+  // value lower than the reference.
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD(128);
+}
+#endif  // CONFIG_VP8_ENCODER
+
+#if CONFIG_VP9_ENCODER
+TEST_P(SADVP9Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, 255);
+  CheckSAD();
+}
+
+TEST_P(SADVP9Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADVP9Test, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADVP9Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADVP9Test, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  source_stride_ = tmp_stride;
+}
+#endif  // CONFIG_VP9_ENCODER
+
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(GetReference(0), reference_stride_, 255);
@@ -204,12 +330,6 @@
   CheckSADs();
 }
 
-TEST_P(SADTest, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSad(UINT_MAX);
-}
-
 TEST_P(SADx4Test, MaxSrc) {
   FillConstant(source_data_, source_stride_, 255);
   FillConstant(GetReference(0), reference_stride_, 0);
@@ -219,15 +339,6 @@
   CheckSADs();
 }
 
-TEST_P(SADTest, ShortRef) {
-  int tmp_stride = reference_stride_;
-  reference_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSad(UINT_MAX);
-  reference_stride_ = tmp_stride;
-}
-
 TEST_P(SADx4Test, ShortRef) {
   int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
@@ -240,17 +351,6 @@
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADTest, UnalignedRef) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  int tmp_stride = reference_stride_;
-  reference_stride_ -= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSad(UINT_MAX);
-  reference_stride_ = tmp_stride;
-}
-
 TEST_P(SADx4Test, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
@@ -265,15 +365,6 @@
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADTest, ShortSrc) {
-  int tmp_stride = source_stride_;
-  source_stride_ >>= 1;
-  FillRandom(source_data_, source_stride_);
-  FillRandom(reference_data_, reference_stride_);
-  CheckSad(UINT_MAX);
-  source_stride_ = tmp_stride;
-}
-
 TEST_P(SADx4Test, ShortSrc) {
   int tmp_stride = source_stride_;
   source_stride_ >>= 1;
@@ -286,14 +377,6 @@
   source_stride_ = tmp_stride;
 }
 
-TEST_P(SADTest, MaxSAD) {
-  // Verify that, when max_sad is set, the implementation does not return a
-  // value lower than the reference.
-  FillConstant(source_data_, source_stride_, 255);
-  FillConstant(reference_data_, reference_stride_, 0);
-  CheckSad(128);
-}
-
 using std::tr1::make_tuple;
 
 //------------------------------------------------------------------------------
@@ -304,27 +387,27 @@
 const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c;
 const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c;
 const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c;
-#endif
-#if CONFIG_VP9_ENCODER
-const sad_m_by_n_fn_t sad_64x64_c_vp9 = vp9_sad64x64_c;
-const sad_m_by_n_fn_t sad_32x32_c_vp9 = vp9_sad32x32_c;
-const sad_m_by_n_fn_t sad_16x16_c_vp9 = vp9_sad16x16_c;
-const sad_m_by_n_fn_t sad_8x16_c_vp9 = vp9_sad8x16_c;
-const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c;
-const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c;
-const sad_m_by_n_fn_t sad_8x4_c_vp9 = vp9_sad8x4_c;
-const sad_m_by_n_fn_t sad_4x8_c_vp9 = vp9_sad4x8_c;
-const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c;
-#endif
 const sad_m_by_n_test_param_t c_tests[] = {
-#if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_c),
   make_tuple(8, 16, sad_8x16_c),
   make_tuple(16, 8, sad_16x8_c),
   make_tuple(8, 8, sad_8x8_c),
   make_tuple(4, 4, sad_4x4_c),
-#endif
+};
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+#endif  // CONFIG_VP8_ENCODER
+
 #if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_vp9_t sad_64x64_c_vp9 = vp9_sad64x64_c;
+const sad_m_by_n_fn_vp9_t sad_32x32_c_vp9 = vp9_sad32x32_c;
+const sad_m_by_n_fn_vp9_t sad_16x16_c_vp9 = vp9_sad16x16_c;
+const sad_m_by_n_fn_vp9_t sad_8x16_c_vp9 = vp9_sad8x16_c;
+const sad_m_by_n_fn_vp9_t sad_16x8_c_vp9 = vp9_sad16x8_c;
+const sad_m_by_n_fn_vp9_t sad_8x8_c_vp9 = vp9_sad8x8_c;
+const sad_m_by_n_fn_vp9_t sad_8x4_c_vp9 = vp9_sad8x4_c;
+const sad_m_by_n_fn_vp9_t sad_4x8_c_vp9 = vp9_sad4x8_c;
+const sad_m_by_n_fn_vp9_t sad_4x4_c_vp9 = vp9_sad4x4_c;
+const sad_m_by_n_test_param_vp9_t c_vp9_tests[] = {
   make_tuple(64, 64, sad_64x64_c_vp9),
   make_tuple(32, 32, sad_32x32_c_vp9),
   make_tuple(16, 16, sad_16x16_c_vp9),
@@ -334,11 +417,9 @@
   make_tuple(8, 4, sad_8x4_c_vp9),
   make_tuple(4, 8, sad_4x8_c_vp9),
   make_tuple(4, 4, sad_4x4_c_vp9),
-#endif
 };
-INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests));
 
-#if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
 const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c;
 const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c;
@@ -375,8 +456,8 @@
 const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;
 INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_armv6)));
-#endif
-#endif
+#endif  // CONFIG_VP8_ENCODER
+#endif  // HAVE_MEDIA
 
 #if HAVE_NEON
 #if CONFIG_VP8_ENCODER
@@ -391,8 +472,8 @@
                         make_tuple(16, 8, sad_16x8_neon),
                         make_tuple(8, 8, sad_8x8_neon),
                         make_tuple(4, 4, sad_4x4_neon)));
-#endif
-#endif
+#endif  // CONFIG_VP8_ENCODER
+#endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
 // x86 functions
@@ -403,40 +484,39 @@
 const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx;
 const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx;
 const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx;
-#endif
-#if CONFIG_VP9_ENCODER
-const sad_m_by_n_fn_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
-const sad_m_by_n_fn_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
-const sad_m_by_n_fn_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
-const sad_m_by_n_fn_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
-const sad_m_by_n_fn_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
-#endif
-
 const sad_m_by_n_test_param_t mmx_tests[] = {
-#if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_mmx),
   make_tuple(8, 16, sad_8x16_mmx),
   make_tuple(16, 8, sad_16x8_mmx),
   make_tuple(8, 8, sad_8x8_mmx),
   make_tuple(4, 4, sad_4x4_mmx),
-#endif
+};
+INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
+#endif  // CONFIG_VP8_ENCODER
+
 #if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_vp9_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
+const sad_m_by_n_fn_vp9_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
+const sad_m_by_n_fn_vp9_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
+const sad_m_by_n_fn_vp9_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
+const sad_m_by_n_fn_vp9_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
+const sad_m_by_n_test_param_vp9_t mmx_vp9_tests[] = {
   make_tuple(16, 16, sad_16x16_mmx_vp9),
   make_tuple(8, 16, sad_8x16_mmx_vp9),
   make_tuple(16, 8, sad_16x8_mmx_vp9),
   make_tuple(8, 8, sad_8x8_mmx_vp9),
   make_tuple(4, 4, sad_4x4_mmx_vp9),
-#endif
 };
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif
+INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests));
+#endif  // CONFIG_VP9_ENCODER
+#endif  // HAVE_MMX
 
 #if HAVE_SSE
 #if CONFIG_VP9_ENCODER
 #if CONFIG_USE_X86INC
-const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
-const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
-INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
+const sad_m_by_n_fn_vp9_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
+const sad_m_by_n_fn_vp9_t sad_4x8_sse_vp9 = vp9_sad4x8_sse;
+INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values(
                         make_tuple(4, 4, sad_4x4_sse_vp9),
                         make_tuple(4, 8, sad_4x8_sse_vp9)));
 
@@ -456,32 +536,30 @@
 const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt;
 const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
-#endif
-#if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
-const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
-const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
-const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
-const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
-const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
-const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
-const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
-const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
-const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
-const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
-#endif
-#endif
 const sad_m_by_n_test_param_t sse2_tests[] = {
-#if CONFIG_VP8_ENCODER
   make_tuple(16, 16, sad_16x16_wmt),
   make_tuple(8, 16, sad_8x16_wmt),
   make_tuple(16, 8, sad_16x8_wmt),
   make_tuple(8, 8, sad_8x8_wmt),
   make_tuple(4, 4, sad_4x4_wmt),
-#endif
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+#endif  // CONFIG_VP8_ENCODER
+
 #if CONFIG_VP9_ENCODER
 #if CONFIG_USE_X86INC
+const sad_m_by_n_fn_vp9_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
+const sad_m_by_n_fn_vp9_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
+const sad_m_by_n_fn_vp9_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
+const sad_m_by_n_fn_vp9_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
+const sad_m_by_n_fn_vp9_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
+const sad_m_by_n_fn_vp9_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
+const sad_m_by_n_fn_vp9_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
+const sad_m_by_n_fn_vp9_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
+const sad_m_by_n_fn_vp9_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
+const sad_m_by_n_fn_vp9_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
+const sad_m_by_n_fn_vp9_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
+const sad_m_by_n_test_param_vp9_t sse2_vp9_tests[] = {
   make_tuple(64, 64, sad_64x64_sse2_vp9),
   make_tuple(64, 32, sad_64x32_sse2_vp9),
   make_tuple(32, 64, sad_32x64_sse2_vp9),
@@ -493,13 +571,9 @@
   make_tuple(8, 16, sad_8x16_sse2_vp9),
   make_tuple(8, 8, sad_8x8_sse2_vp9),
   make_tuple(8, 4, sad_8x4_sse2_vp9),
-#endif
-#endif
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests));
 
-#if CONFIG_VP9_ENCODER
-#if CONFIG_USE_X86INC
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
@@ -523,9 +597,9 @@
                         make_tuple(8, 16, sad_8x16x4d_sse2),
                         make_tuple(8, 8, sad_8x8x4d_sse2),
                         make_tuple(8, 4, sad_8x4x4d_sse2)));
-#endif
-#endif
-#endif
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_ENCODER
+#endif  // HAVE_SSE2
 
 #if HAVE_SSE3
 #if CONFIG_VP8_ENCODER
@@ -540,8 +614,8 @@
                         make_tuple(8, 16, sad_8x16x4d_sse3),
                         make_tuple(8, 8, sad_8x8x4d_sse3),
                         make_tuple(4, 4, sad_4x4x4d_sse3)));
-#endif
-#endif
+#endif  // CONFIG_VP8_ENCODER
+#endif  // HAVE_SSE3
 
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
@@ -549,8 +623,8 @@
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_sse3)));
-#endif
-#endif
-#endif
+#endif  // CONFIG_VP8_ENCODER
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 }  // namespace
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index b8e4034..78cc6fa 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -68,8 +68,8 @@
     int64x2_t v_sum_diff_total = vdupq_n_s64(0);
 
     /* Go over lines. */
-    int i;
-    for (i = 0; i < 16; ++i) {
+    int r;
+    for (r = 0; r < 16; ++r) {
         /* Load inputs. */
         const uint8x16_t v_sig = vld1q_u8(sig);
         const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
@@ -145,14 +145,91 @@
 
     /* Too much adjustments => copy block. */
     {
-        const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+        int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
                                       vget_low_s64(v_sum_diff_total));
-        const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+        int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
         int sum_diff_thresh = SUM_DIFF_THRESHOLD;
 
         if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
-        if (s0 > sum_diff_thresh)
+        if (sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // checK if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the accceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const uint8x16_t k_delta = vmovq_n_u8(delta);
+            sig -= sig_stride * 16;
+            mc_running_avg_y -= mc_running_avg_y_stride * 16;
+            running_avg_y -= running_avg_y_stride * 16;
+            for (r = 0; r < 16; ++r) {
+              uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+              const uint8x16_t v_sig = vld1q_u8(sig);
+              const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+              /* Calculate absolute difference and sign masks. */
+              const uint8x16_t v_abs_diff      = vabdq_u8(v_sig,
+                                                          v_mc_running_avg_y);
+              const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
+                                                          v_mc_running_avg_y);
+              const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
+                                                          v_mc_running_avg_y);
+              // Clamp absolute difference to delta to get the adjustment.
+              const uint8x16_t v_abs_adjustment =
+                  vminq_u8(v_abs_diff, (k_delta));
+
+              const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+                                                           v_abs_adjustment);
+              const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+                                                           v_abs_adjustment);
+
+              v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+              v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+              /* Store results. */
+              vst1q_u8(running_avg_y, v_running_avg_y);
+
+              {
+                  const int8x16_t v_sum_diff =
+                      vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                                vreinterpretq_s8_u8(v_pos_adjustment));
+
+                  const int16x8_t fe_dc_ba_98_76_54_32_10 =
+                      vpaddlq_s8(v_sum_diff);
+                  const int32x4_t fedc_ba98_7654_3210 =
+                      vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+                  const int64x2_t fedcba98_76543210 =
+                      vpaddlq_s32(fedc_ba98_7654_3210);
+
+                  v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
+                                                fedcba98_76543210);
+              }
+              /* Update pointers for next iteration. */
+              sig += sig_stride;
+              mc_running_avg_y += mc_running_avg_y_stride;
+              running_avg_y += running_avg_y_stride;
+            }
+            {
+              // Update the sum of all pixel differences of this MB.
+              x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                            vget_low_s64(v_sum_diff_total));
+              sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+              if (sum_diff > sum_diff_thresh) {
+                return COPY_BLOCK;
+              }
+            }
+          } else {
             return COPY_BLOCK;
+          }
+        }
     }
 
     /* Tell above level that block was filtered. */
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 34879cf..1f212ca 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -125,9 +125,9 @@
 
     int optimize;
     int q_index;
-    int increase_denoising;
 
 #if CONFIG_TEMPORAL_DENOISING
+    int increase_denoising;
     MB_PREDICTION_MODE best_sse_inter_mode;
     int_mv best_sse_mv;
     MV_REFERENCE_FRAME best_reference_frame;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index cf6a82f..817c9ef 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -590,9 +590,9 @@
     int distortion2;
     int bestsme = INT_MAX;
     int best_mode_index = 0;
-    unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
+    unsigned int sse = UINT_MAX, best_rd_sse = UINT_MAX;
 #if CONFIG_TEMPORAL_DENOISING
-    unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX;
+    unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX;
 #endif
 
     int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index d9f39b5..f145d09 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1973,8 +1973,8 @@
                                              cpi->common.y1dc_delta_q);
 
 #if CONFIG_TEMPORAL_DENOISING
-    unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX,
-            best_rd_sse = INT_MAX;
+    unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX,
+            best_rd_sse = UINT_MAX;
 #endif
 
     mode_mv = mode_mv_sb[sign_bias];
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index d868776..3253bcb 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -33,6 +33,9 @@
 #define pair_set_epi16(a, b) \
   _mm_set_epi16(b, a, b, a, b, a, b, a)
 
+#define dual_set_epi16(a, b) \
+  _mm_set_epi16(b, b, b, b, a, a, a, a)
+
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 95cb40a..09ce72e 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -360,7 +360,7 @@
 $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
 
 add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 neon_asm dspr2/;
+specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
 $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
 
 add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
@@ -422,10 +422,6 @@
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
 
-add_proto qw/void vp9_get_sse_sum_16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_16x16 sse2/;
-$vp9_get_sse_sum_16x16_sse2=vp9_get16x16var_sse2;
-
 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
 
@@ -435,9 +431,11 @@
 add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_8x8 sse2/;
-$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2;
+add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
+
+add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -528,82 +526,82 @@
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad64x64/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad32x64/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad64x32/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad32x16/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad16x32/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad32x32/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad8x4/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vp9_sad4x8/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
 specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
 specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
 
 add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ff9c432..b60f8a0 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -375,15 +375,6 @@
     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
   }
 
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
-  }
-
 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
   {                                            \
     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
@@ -612,23 +603,6 @@
   RECON_AND_STORE(dest, dc_value);
 }
 
-static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
 static void idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h
index 1c62e32..0f179b4 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.h
@@ -45,6 +45,32 @@
   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
 
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+                                                        \
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
+  }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
   array_transpose_8x8(res0, res0);
diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c
index 0930e78..73bf5d1 100644
--- a/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c
@@ -383,3 +383,380 @@
   dest += 8;
   write_buffer_8x16(dest, in1, stride);
 }
+
+static void idct16_10_r1(__m128i *in, __m128i *l) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_01 = dual_set_epi16(3212, 32610);
+  const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
+  const __m128i stg3_01 = dual_set_epi16(6392, 32138);
+  const __m128i stg4_01 = dual_set_epi16(23170, 23170);
+
+
+
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  __m128i stp1_0, stp1_1, stp1_4, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
+
+  // Stage2
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
+    const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);
+
+    stp2_8  = _mm_mulhrs_epi16(lo_1_15, stg2_01);
+    stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
+    stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+  }
+
+  // Stage4
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+
+    stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
+    stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp2, tmp4);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+  }
+
+  // Stage5 and Stage6
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6
+  {
+    const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
+    const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
+    const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
+    const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
+    const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);
+
+    tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
+    tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
+    tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);
+
+    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
+    tmp0   = _mm_mulhrs_epi16(tmp0, stg4_01);
+    tmp4   = _mm_mulhrs_epi16(tmp4, stg4_01);
+
+    stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
+    stp2_12 = _mm_unpackhi_epi64(tmp4, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Left 8x16 only.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+}
+
+static void idct16_10_r2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  const __m128i stg2_0 = dual_set_epi16(3212, 3212);
+  const __m128i stg2_1 = dual_set_epi16(32610, 32610);
+  const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
+  const __m128i stg2_7 = dual_set_epi16(31358, 31358);
+  const __m128i stg3_0 = dual_set_epi16(6392, 6392);
+  const __m128i stg3_1 = dual_set_epi16(32138, 32138);
+  const __m128i stg4_01 = dual_set_epi16(23170, 23170);
+
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  /* Stage2 */
+  {
+    stp1_8_0  = _mm_mulhrs_epi16(in[1], stg2_0);
+    stp1_15   = _mm_mulhrs_epi16(in[1], stg2_1);
+    stp1_11   = _mm_mulhrs_epi16(in[3], stg2_6);
+    stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
+  }
+
+  /* Stage3 */
+  {
+    stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
+    stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);
+
+    stp1_9  =  stp1_8_0;
+    stp1_10 =  stp1_11;
+
+    stp1_13 = stp1_12_0;
+    stp1_14 = stp1_15;
+  }
+
+  /* Stage4 */
+  {
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+    stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);
+
+    stp2_5 = stp2_4;
+    stp2_6 = stp2_7;
+
+
+    tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
+    tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
+    tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
+    tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
+    tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, 14);
+    tmp1 = _mm_srai_epi32(tmp1, 14);
+    tmp2 = _mm_srai_epi32(tmp2, 14);
+    tmp3 = _mm_srai_epi32(tmp3, 14);
+    tmp4 = _mm_srai_epi32(tmp4, 14);
+    tmp5 = _mm_srai_epi32(tmp5, 14);
+    tmp6 = _mm_srai_epi32(tmp6, 14);
+    tmp7 = _mm_srai_epi32(tmp7, 14);
+
+    stp2_9 = _mm_packs_epi32(tmp0, tmp1);
+    stp2_14 = _mm_packs_epi32(tmp2, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp4, tmp5);
+    stp2_13 = _mm_packs_epi32(tmp6, tmp7);
+  }
+
+  /* Stage5 */
+  {
+    stp1_2 = stp1_0;
+    stp1_3 = stp1_0;
+
+    tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
+    tmp1 = _mm_add_epi16(stp2_6, stp2_5);
+
+    stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
+    stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
+
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
+
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
+  }
+
+  /* Stage6 */
+  {
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
+    stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
+
+    tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
+    tmp1 = _mm_add_epi16(stp1_13, stp1_10);
+    tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
+    tmp3 = _mm_add_epi16(stp1_12, stp1_11);
+
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+    stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
+
+    stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
+    stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
+    stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
+    stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
+  }
+
+  // Stage7
+  in[0] = _mm_add_epi16(stp2_0, stp1_15);
+  in[1] = _mm_add_epi16(stp2_1, stp1_14);
+  in[2] = _mm_add_epi16(stp2_2, stp2_13);
+  in[3] = _mm_add_epi16(stp2_3, stp2_12);
+  in[4] = _mm_add_epi16(stp2_4, stp2_11);
+  in[5] = _mm_add_epi16(stp2_5, stp2_10);
+  in[6] = _mm_add_epi16(stp2_6, stp1_9);
+  in[7] = _mm_add_epi16(stp2_7, stp1_8);
+  in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+}
+
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i in[16], l[16];
+
+  int i;
+  // First 1-D inverse DCT
+  // Load input data.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+  idct16_10_r1(in, l);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 2; i++) {
+    array_transpose_4X8(l + 8*i, in);
+
+    idct16_10_r2(in);
+
+    // Final rounding and shift
+    in[0] = _mm_adds_epi16(in[0], final_rounding);
+    in[1] = _mm_adds_epi16(in[1], final_rounding);
+    in[2] = _mm_adds_epi16(in[2], final_rounding);
+    in[3] = _mm_adds_epi16(in[3], final_rounding);
+    in[4] = _mm_adds_epi16(in[4], final_rounding);
+    in[5] = _mm_adds_epi16(in[5], final_rounding);
+    in[6] = _mm_adds_epi16(in[6], final_rounding);
+    in[7] = _mm_adds_epi16(in[7], final_rounding);
+    in[8] = _mm_adds_epi16(in[8], final_rounding);
+    in[9] = _mm_adds_epi16(in[9], final_rounding);
+    in[10] = _mm_adds_epi16(in[10], final_rounding);
+    in[11] = _mm_adds_epi16(in[11], final_rounding);
+    in[12] = _mm_adds_epi16(in[12], final_rounding);
+    in[13] = _mm_adds_epi16(in[13], final_rounding);
+    in[14] = _mm_adds_epi16(in[14], final_rounding);
+    in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+    in[0] = _mm_srai_epi16(in[0], 6);
+    in[1] = _mm_srai_epi16(in[1], 6);
+    in[2] = _mm_srai_epi16(in[2], 6);
+    in[3] = _mm_srai_epi16(in[3], 6);
+    in[4] = _mm_srai_epi16(in[4], 6);
+    in[5] = _mm_srai_epi16(in[5], 6);
+    in[6] = _mm_srai_epi16(in[6], 6);
+    in[7] = _mm_srai_epi16(in[7], 6);
+    in[8] = _mm_srai_epi16(in[8], 6);
+    in[9] = _mm_srai_epi16(in[9], 6);
+    in[10] = _mm_srai_epi16(in[10], 6);
+    in[11] = _mm_srai_epi16(in[11], 6);
+    in[12] = _mm_srai_epi16(in[12], 6);
+    in[13] = _mm_srai_epi16(in[13], 6);
+    in[14] = _mm_srai_epi16(in[14], 6);
+    in[15] = _mm_srai_epi16(in[15], 6);
+
+    RECON_AND_STORE(dest, in[0]);
+    RECON_AND_STORE(dest, in[1]);
+    RECON_AND_STORE(dest, in[2]);
+    RECON_AND_STORE(dest, in[3]);
+    RECON_AND_STORE(dest, in[4]);
+    RECON_AND_STORE(dest, in[5]);
+    RECON_AND_STORE(dest, in[6]);
+    RECON_AND_STORE(dest, in[7]);
+    RECON_AND_STORE(dest, in[8]);
+    RECON_AND_STORE(dest, in[9]);
+    RECON_AND_STORE(dest, in[10]);
+    RECON_AND_STORE(dest, in[11]);
+    RECON_AND_STORE(dest, in[12]);
+    RECON_AND_STORE(dest, in[13]);
+    RECON_AND_STORE(dest, in[14]);
+    RECON_AND_STORE(dest, in[15]);
+
+    dest += 8 - (stride * 16);
+  }
+}
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 47ad8d8..0d6b41d 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -47,11 +47,21 @@
 
     // Use some of the segments for in frame Q adjustment.
     for (segment = 1; segment < 2; segment++) {
-      const int qindex_delta =
+      int qindex_delta =
           vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
                                      in_frame_q_adj_ratio[segment]);
-      vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
-      vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+
+      // For AQ mode 2, we dont allow Q0 in a segment if the base Q is not 0.
+      // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
+      // is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+      if ((cm->base_qindex + qindex_delta) > 0) {
+        vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+        vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+      }
     }
   }
 }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2e4cd86..914bb37 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -478,8 +478,8 @@
         unsigned int sse = 0;
         int sum = 0;
         if (x_idx < pixels_wide && y_idx < pixels_high)
-          vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
-                              d + y_idx * dp + x_idx, dp, &sse, &sum);
+          vp9_get8x8var(s + y_idx * sp + x_idx, sp,
+                        d + y_idx * dp + x_idx, dp, &sse, &sum);
         fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
       }
     }
@@ -1214,9 +1214,9 @@
         int b_offset = b_mi_row * MI_SIZE * src_stride +
                        b_mi_col * MI_SIZE;
 
-        vp9_get_sse_sum_16x16(src + b_offset, src_stride,
-                              pre_src + b_offset, pre_stride,
-                              &d16[j].sse, &d16[j].sum);
+        vp9_get16x16var(src + b_offset, src_stride,
+                        pre_src + b_offset, pre_stride,
+                        &d16[j].sse, &d16[j].sum);
 
         d16[j].var = d16[j].sse -
             (((uint32_t)d16[j].sum * d16[j].sum) >> 8);
@@ -1291,14 +1291,14 @@
   if (row8x8_remaining >= MI_BLOCK_SIZE &&
       col8x8_remaining >= MI_BLOCK_SIZE) {
     this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
-                                            pre, pre_stride, 0x7fffffff);
+                                            pre, pre_stride);
     threshold = (1 << 12);
   } else {
     int r, c;
     for (r = 0; r < row8x8_remaining; r += 2)
       for (c = 0; c < col8x8_remaining; c += 2)
-        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, pre,
-                                                 pre_stride, 0x7fffffff);
+        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride,
+                                                 pre, pre_stride);
     threshold = (row8x8_remaining * col8x8_remaining) << 6;
   }
 
@@ -2369,22 +2369,6 @@
              sizeof(*xd->above_seg_context) * aligned_mi_cols);
 }
 
-static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
-  if (lossless) {
-    // printf("Switching to lossless\n");
-    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
-    cpi->mb.itxm_add = vp9_iwht4x4_add;
-    cpi->mb.optimize = 0;
-    cpi->common.lf.filter_level = 0;
-    cpi->zbin_mode_boost_enabled = 0;
-    cpi->common.tx_mode = ONLY_4X4;
-  } else {
-    // printf("Not lossless\n");
-    cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
-    cpi->mb.itxm_add = vp9_idct4x4_add;
-  }
-}
-
 static int check_dual_ref_flags(VP9_COMP *cpi) {
   const int ref_flags = cpi->ref_frame_flags;
 
@@ -2421,7 +2405,7 @@
 }
 
 static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
-  if (cpi->oxcf.lossless) {
+  if (cpi->mb.e_mbd.lossless) {
     return ONLY_4X4;
   } else if (cpi->common.current_video_frame == 0) {
     return TX_MODE_SELECT;
@@ -2993,6 +2977,33 @@
          cm->show_frame;
 }
 
+static void encode_tiles(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  int tile_col, tile_row;
+  TOKENEXTRA *tok = cpi->tok;
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileInfo tile;
+      TOKENEXTRA *old_tok = tok;
+      int mi_row;
+
+      vp9_tile_init(&tile, cm, tile_row, tile_col);
+      for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+           mi_row += MI_BLOCK_SIZE) {
+        if (cpi->sf.use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
+          encode_nonrd_sb_row(cpi, &tile, mi_row, &tok);
+        else
+          encode_rd_sb_row(cpi, &tile, mi_row, &tok);
+      }
+      cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok);
+      assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+    }
+  }
+}
+
 static void encode_frame_internal(VP9_COMP *cpi) {
   SPEED_FEATURES *const sf = &cpi->sf;
   RD_OPT *const rd_opt = &cpi->rd;
@@ -3011,13 +3022,21 @@
   vp9_zero(rd_opt->tx_select_diff);
   vp9_zero(rd_opt->tx_select_threshes);
 
-  cm->tx_mode = select_tx_mode(cpi);
-
   cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
                            cm->y_dc_delta_q == 0 &&
                            cm->uv_dc_delta_q == 0 &&
                            cm->uv_ac_delta_q == 0;
-  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
+  cm->tx_mode = select_tx_mode(cpi);
+
+  cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+  cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+
+  if (cpi->mb.e_mbd.lossless) {
+    cpi->mb.optimize = 0;
+    cpi->common.lf.filter_level = 0;
+    cpi->zbin_mode_boost_enabled = 0;
+  }
 
   vp9_frame_init_quantizer(cpi);
 
@@ -3070,33 +3089,7 @@
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
 
-    {
-      // Take tiles into account and give start/end MB
-      int tile_col, tile_row;
-      TOKENEXTRA *tp = cpi->tok;
-      const int tile_cols = 1 << cm->log2_tile_cols;
-      const int tile_rows = 1 << cm->log2_tile_rows;
-
-      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-        for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-          TileInfo tile;
-          TOKENEXTRA *tp_old = tp;
-          int mi_row;
-
-          // For each row of SBs in the frame
-          vp9_tile_init(&tile, cm, tile_row, tile_col);
-          for (mi_row = tile.mi_row_start;
-               mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
-            if (sf->use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
-              encode_nonrd_sb_row(cpi, &tile, mi_row, &tp);
-            else
-              encode_rd_sb_row(cpi, &tile, mi_row, &tp);
-          }
-          cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
-          assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
-        }
-      }
-    }
+    encode_tiles(cpi);
 
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 90155f3..0ebc936 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -393,11 +393,6 @@
   // Set rd thresholds based on mode and speed setting
   vp9_set_rd_speed_thresholds(cpi);
   vp9_set_rd_speed_thresholds_sub8x8(cpi);
-
-  cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
-  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
-  }
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
@@ -596,16 +591,6 @@
   if (cpi->oxcf.mode == REALTIME)
     cpi->oxcf.play_alternate = 0;
 
-  cpi->oxcf.lossless = oxcf->lossless;
-  if (cpi->oxcf.lossless) {
-    // In lossless mode, make sure right quantizer range and correct transform
-    // is set.
-    cpi->oxcf.worst_allowed_q = 0;
-    cpi->oxcf.best_allowed_q = 0;
-    cpi->mb.itxm_add = vp9_iwht4x4_add;
-  } else {
-    cpi->mb.itxm_add = vp9_idct4x4_add;
-  }
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
@@ -627,33 +612,30 @@
 
   // local file playback mode == really big buffer
   if (cpi->oxcf.rc_mode == RC_MODE_VBR) {
-    cpi->oxcf.starting_buffer_level   = 60000;
-    cpi->oxcf.optimal_buffer_level    = 60000;
-    cpi->oxcf.maximum_buffer_size     = 240000;
+    cpi->oxcf.starting_buffer_level_ms = 60000;
+    cpi->oxcf.optimal_buffer_level_ms = 60000;
+    cpi->oxcf.maximum_buffer_size_ms = 240000;
   }
 
-  cpi->oxcf.starting_buffer_level =
-      vp9_rescale(cpi->oxcf.starting_buffer_level,
-                  cpi->oxcf.target_bandwidth, 1000);
+  rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms,
+                                          cpi->oxcf.target_bandwidth, 1000);
 
   // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level == 0)
-    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+  if (cpi->oxcf.optimal_buffer_level_ms == 0)
+    rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
   else
-    cpi->oxcf.optimal_buffer_level =
-        vp9_rescale(cpi->oxcf.optimal_buffer_level,
-                    cpi->oxcf.target_bandwidth, 1000);
+    rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms,
+                                           cpi->oxcf.target_bandwidth, 1000);
 
-  if (cpi->oxcf.maximum_buffer_size == 0)
-    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+  if (cpi->oxcf.maximum_buffer_size_ms == 0)
+    rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
   else
-    cpi->oxcf.maximum_buffer_size =
-        vp9_rescale(cpi->oxcf.maximum_buffer_size,
-                    cpi->oxcf.target_bandwidth, 1000);
+    rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms,
+                                          cpi->oxcf.target_bandwidth, 1000);
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
-  rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size);
-  rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size);
+  rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
   vp9_new_framerate(cpi, cpi->oxcf.framerate);
@@ -1439,21 +1421,6 @@
   vp8_yv12_extend_frame_borders_c(dst);
 }
 
-static int find_fp_qindex() {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (vp9_convert_qindex_to_q(i) >= 30.0) {
-      break;
-    }
-  }
-
-  if (i == QINDEX_RANGE)
-    i--;
-
-  return i;
-}
-
 #define WRITE_RECON_BUFFER 0
 #if WRITE_RECON_BUFFER
 void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
@@ -2308,17 +2275,6 @@
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 }
 
-static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
-                        unsigned int *frame_flags) {
-  (void) size;
-  (void) dest;
-  (void) frame_flags;
-
-  vp9_rc_get_first_pass_params(cpi);
-  vp9_set_quantizer(&cpi->common, find_fp_qindex());
-  vp9_first_pass(cpi);
-}
-
 static void Pass2Encode(VP9_COMP *cpi, size_t *size,
                         uint8_t *dest, unsigned int *frame_flags) {
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
@@ -2658,7 +2614,10 @@
 
   if (cpi->pass == 1 &&
       (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
-    Pass1Encode(cpi, size, dest, frame_flags);
+    const int lossless = is_lossless_requested(&cpi->oxcf);
+    cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+    cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+    vp9_first_pass(cpi);
   } else if (cpi->pass == 2 &&
       (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
     Pass2Encode(cpi, size, dest, frame_flags);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 47c9019..c69a345 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -216,9 +216,9 @@
   int over_shoot_pct;
 
   // buffering parameters
-  int64_t starting_buffer_level;  // in seconds
-  int64_t optimal_buffer_level;
-  int64_t maximum_buffer_size;
+  int64_t starting_buffer_level_ms;
+  int64_t optimal_buffer_level_ms;
+  int64_t maximum_buffer_size_ms;
 
   // Frame drop threshold.
   int drop_frames_water_mark;
@@ -228,7 +228,6 @@
   int worst_allowed_q;
   int best_allowed_q;
   int cq_level;
-  int lossless;
   AQ_MODE aq_mode;  // Adaptive Quantization mode
 
   // Internal frame size scaling.
@@ -257,7 +256,6 @@
 
   // these parameters aren't to be used in final build don't use!!!
   int play_alternate;
-  int alt_freq;
 
   int encode_breakout;  // early breakout : for video conf recommend 800
 
@@ -286,6 +284,10 @@
   vp8e_tuning tuning;
 } VP9EncoderConfig;
 
+static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
+  return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
+}
+
 static INLINE int is_best_mode(MODE mode) {
   return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
 }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c1d925a..a49fe3d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -398,6 +398,32 @@
   }
 }
 
+static int find_fp_qindex() {
+  int i;
+
+  for (i = 0; i < QINDEX_RANGE; ++i)
+    if (vp9_convert_qindex_to_q(i) >= 30.0)
+      break;
+
+  if (i == QINDEX_RANGE)
+    i--;
+
+  return i;
+}
+
+static void set_first_pass_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 ||
+       (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+    cm->frame_type = KEY_FRAME;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
 void vp9_first_pass(VP9_COMP *cpi) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
@@ -434,10 +460,12 @@
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
-  FIRSTPASS_STATS fps;
 
   vp9_clear_system_state();
 
+  set_first_pass_params(cpi);
+  vp9_set_quantizer(cm, find_fp_qindex());
+
   if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
     MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
     const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
@@ -729,6 +757,8 @@
 
   vp9_clear_system_state();
   {
+    FIRSTPASS_STATS fps;
+
     fps.frame = cm->current_video_frame;
     fps.spatial_layer_id = cpi->svc.spatial_layer_id;
     fps.intra_error = (double)(intra_error >> 8);
@@ -767,7 +797,8 @@
     fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
     // Don't want to do output stats with a stack variable!
-    output_stats(&fps, cpi->output_pkt_list);
+    twopass->this_frame_stats = fps;
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
     accumulate_stats(&twopass->total_stats, &fps);
   }
 
@@ -775,9 +806,9 @@
   // the prediction is good enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (fps.pcnt_inter > 0.20) &&
-       ((fps.intra_error /
-         DOUBLE_DIVIDE_CHECK(fps.coded_error)) > 2.0))) {
+       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+       ((twopass->this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
       vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     }
@@ -2051,19 +2082,6 @@
   twopass->modified_error_left -= kf_group_err;
 }
 
-void vp9_rc_get_first_pass_params(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  if (!cpi->refresh_alt_ref_frame &&
-      (cm->current_video_frame == 0 ||
-       (cpi->frame_flags & FRAMEFLAGS_KEY))) {
-    cm->frame_type = KEY_FRAME;
-  } else {
-    cm->frame_type = INTER_FRAME;
-  }
-  // Do not use periodic key frames.
-  cpi->rc.frames_to_key = INT_MAX;
-}
-
 // For VBR...adjustment to the frame target based on error from previous frames
 void vbr_rate_correction(int * this_frame_target,
                          const int64_t vbr_bits_off_target) {
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index c89cfaf..309638c 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -44,6 +44,7 @@
   unsigned int section_intra_rating;
   unsigned int next_iiratio;
   FIRSTPASS_STATS total_stats;
+  FIRSTPASS_STATS this_frame_stats;
   const FIRSTPASS_STATS *stats_in;
   const FIRSTPASS_STATS *stats_in_start;
   const FIRSTPASS_STATS *stats_in_end;
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 041e583..842bc5b 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -72,8 +72,7 @@
   x->mv_row_max = tmp_row_max;
 
   return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-          INT_MAX);
+          xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 }
 
 static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
@@ -86,8 +85,7 @@
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                     INT_MAX);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
   dst_mv->as_int = 0;
 
   // Test last reference frame using the previous best mv as the
@@ -123,8 +121,7 @@
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                     INT_MAX);
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
 
   dst_mv->as_int = 0;
 
@@ -147,7 +144,7 @@
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);
     err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 
     // find best
     if (err < best_err) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4f7d6f1..dbd19a2 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -524,9 +524,8 @@
 
   // Work out the start point for the search
   bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, ref_mv), in_what->stride,
-                     0x7fffffff) + mvsad_err_cost(x, ref_mv, &fcenter_mv,
-                                                  sad_per_bit);
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
   // Search all possible scales upto the search param around the center point
   // pick the scale of the point that is best as the starting scale of
@@ -542,7 +541,7 @@
                               bc + candidates[t][i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       } else {
@@ -553,7 +552,7 @@
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       }
@@ -585,7 +584,7 @@
                                 bc + candidates[s][i].col};
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         } else {
@@ -596,7 +595,7 @@
               continue;
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         }
@@ -623,7 +622,7 @@
                                 bc + candidates[s][next_chkpts_indices[i]].col};
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         } else {
@@ -634,7 +633,7 @@
               continue;
             thissad = vfp->sdf(what->buf, what->stride,
                                get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride, bestsad);
+                               in_what->stride);
             CHECK_BETTER
           }
         }
@@ -661,7 +660,7 @@
                               bc + neighbors[i].col};
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       } else {
@@ -672,7 +671,7 @@
             continue;
           thissad = vfp->sdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride, bestsad);
+                             in_what->stride);
           CHECK_BETTER
         }
       }
@@ -894,8 +893,7 @@
   *best_mv = *ref_mv;
   *num00 = 11;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride,
-                         0x7fffffff) +
+                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
                  mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   start_row = MAX(-range, x->mv_row_min - ref_mv->row);
   start_col = MAX(-range, x->mv_col_min - ref_mv->col);
@@ -929,7 +927,7 @@
         for (i = 0; i < end_col - c; ++i) {
           const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
           unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+              get_buf_from_mv(in_what, &mv), in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
             if (sad < best_sad) {
@@ -975,7 +973,7 @@
 
   // Check the starting position
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         best_address, in_what->stride, 0x7fffffff) +
+                         best_address, in_what->stride) +
       mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
@@ -986,8 +984,7 @@
                      best_mv->col + ss[i].mv.col};
       if (is_mv_in(x, &mv)) {
        int sad = fn_ptr->sdf(what->buf, what->stride,
-                             best_address + ss[i].offset, in_what->stride,
-                             best_sad);
+                             best_address + ss[i].offset, in_what->stride);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
           if (sad < best_sad) {
@@ -1012,7 +1009,7 @@
         if (is_mv_in(x, &this_mv)) {
           int sad = fn_ptr->sdf(what->buf, what->stride,
                                 best_address + ss[best_site].offset,
-                                in_what->stride, best_sad);
+                                in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
             if (sad < best_sad) {
@@ -1077,7 +1074,7 @@
   best_address = in_what;
 
   // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
                 + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
@@ -1129,7 +1126,7 @@
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *const check_here = ss[i].offset + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride, bestsad);
+                                             in_what_stride);
 
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -1154,7 +1151,7 @@
         if (is_mv_in(x, &this_mv)) {
           const uint8_t *const check_here = ss[best_site].offset + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
-                                             in_what_stride, bestsad);
+                                             in_what_stride);
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                       mvjsadcost, mvsadcost, sad_per_bit);
@@ -1253,7 +1250,7 @@
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1261,7 +1258,7 @@
     for (c = col_min; c < col_max; ++c) {
       const MV mv = {r, c};
       const int sad = fn_ptr->sdf(what->buf, what->stride,
-          get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) +
+          get_buf_from_mv(in_what, &mv), in_what->stride) +
               mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
       if (sad < best_sad) {
         best_sad = sad;
@@ -1286,7 +1283,7 @@
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1320,7 +1317,7 @@
 
     while (c < col_max) {
       unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-                                     check_here, in_what->stride, best_sad);
+                                     check_here, in_what->stride);
       if (sad < best_sad) {
         const MV mv = {r, c};
         sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
@@ -1351,7 +1348,7 @@
   const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
   *best_mv = *ref_mv;
 
@@ -1409,7 +1406,7 @@
 
     while (c < col_max) {
       unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-                                     check_here, in_what->stride, best_sad);
+                                     check_here, in_what->stride);
       if (sad < best_sad) {
         const MV mv = {r, c};
         sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
@@ -1438,7 +1435,7 @@
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
                                      get_buf_from_mv(in_what, ref_mv),
-                                     in_what->stride, 0x7fffffff) +
+                                     in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1450,7 +1447,7 @@
                      ref_mv->col + neighbors[j].col};
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+            get_buf_from_mv(in_what, &mv), in_what->stride);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
@@ -1483,7 +1480,7 @@
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
   unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address,
-                                    in_what->stride, 0x7fffffff) +
+                                    in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1524,7 +1521,7 @@
         if (is_mv_in(x, &mv)) {
           unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
                                          get_buf_from_mv(in_what, &mv),
-                                         in_what->stride, best_sad);
+                                         in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
             if (sad < best_sad) {
@@ -1563,8 +1560,7 @@
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride,
-      second_pred, 0x7fffffff) +
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -1577,8 +1573,7 @@
 
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
-            get_buf_from_mv(in_what, &mv), in_what->stride,
-            second_pred, best_sad);
+            get_buf_from_mv(in_what, &mv), in_what->stride, second_pred);
         if (sad < best_sad) {
           sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
           if (sad < best_sad) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index b58eac9..e8e4256 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -159,7 +159,7 @@
     lrc->bits_off_target += bits_off_for_this_layer;
 
     // Clip buffer level to maximum buffer size for the layer.
-    lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
+    lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = lrc->bits_off_target;
   }
 }
@@ -167,7 +167,6 @@
 // Update the buffer level: leaky bucket model.
 static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
   const VP9_COMMON *const cm = &cpi->common;
-  const VP9EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
@@ -178,7 +177,7 @@
   }
 
   // Clip the buffer level to the maximum specified buffer size.
-  rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size);
+  rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
   rc->buffer_level = rc->bits_off_target;
 
   if (cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR) {
@@ -188,23 +187,20 @@
 
 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   if (pass == 0 && oxcf->rc_mode == RC_MODE_CBR) {
-    rc->avg_frame_qindex[0] = oxcf->worst_allowed_q;
-    rc->avg_frame_qindex[1] = oxcf->worst_allowed_q;
-    rc->avg_frame_qindex[2] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
   } else {
-    rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q +
-                                   oxcf->best_allowed_q) / 2;
-    rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q +
-                                   oxcf->best_allowed_q) / 2;
-    rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q +
-                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
+                                           oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
+                                           oxcf->best_allowed_q) / 2;
   }
 
   rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
   rc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
 
-  rc->buffer_level =    oxcf->starting_buffer_level;
-  rc->bits_off_target = oxcf->starting_buffer_level;
+  rc->buffer_level =    rc->starting_buffer_level;
+  rc->bits_off_target = rc->starting_buffer_level;
 
   rc->rolling_target_bits      = rc->avg_frame_bandwidth;
   rc->rolling_actual_bits      = rc->avg_frame_bandwidth;
@@ -250,7 +246,7 @@
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
       int drop_mark = (int)(oxcf->drop_frames_water_mark *
-          oxcf->optimal_buffer_level / 100);
+          rc->optimal_buffer_level / 100);
       if ((rc->buffer_level > drop_mark) &&
           (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
@@ -432,7 +428,6 @@
                                              : rc->last_q[INTER_FRAME] * 2;
     }
   }
-
   return MIN(active_worst_quality, rc->worst_quality);
 }
 
@@ -444,10 +439,9 @@
   // ambient Q (at buffer = optimal level) to worst_quality level
   // (at buffer = critical level).
   const VP9_COMMON *const cm = &cpi->common;
-  const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   // Buffer level below which we push active_worst to worst_quality.
-  int64_t critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t critical_level = rc->optimal_buffer_level >> 2;
   int64_t buff_lvl_step = 0;
   int adjustment = 0;
   int active_worst_quality;
@@ -459,26 +453,26 @@
   else
     active_worst_quality = MIN(rc->worst_quality,
                                rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
-  if (rc->buffer_level > oxcf->optimal_buffer_level) {
+  if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
     int max_adjustment_down = active_worst_quality / 3;
     if (max_adjustment_down) {
-      buff_lvl_step = ((oxcf->maximum_buffer_size -
-                        oxcf->optimal_buffer_level) / max_adjustment_down);
+      buff_lvl_step = ((rc->maximum_buffer_size -
+                        rc->optimal_buffer_level) / max_adjustment_down);
       if (buff_lvl_step)
-        adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+        adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
                             buff_lvl_step);
       active_worst_quality -= adjustment;
     }
   } else if (rc->buffer_level > critical_level) {
     // Adjust up from ambient Q.
     if (critical_level) {
-      buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
+      buff_lvl_step = (rc->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
         adjustment =
             (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
-                  (oxcf->optimal_buffer_level - rc->buffer_level) /
+                  (rc->optimal_buffer_level - rc->buffer_level) /
                   buff_lvl_step);
       }
       active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
@@ -644,7 +638,7 @@
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
                                             last_boosted_q * 0.75);
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
-    } else if (cm->current_video_frame > 0) {
+    } else {
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
@@ -987,7 +981,6 @@
   } else {
     q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
   }
-
   if (cpi->sf.use_nonrd_pick_mode) {
     if (cpi->sf.force_frame_boost == 1)
       q -= cpi->sf.max_delta_qindex;
@@ -1086,21 +1079,21 @@
     rc->last_q[KEY_FRAME] = qindex;
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
-  } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) &&
-             !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
-    rc->avg_frame_qindex[2] =
-        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2);
   } else {
-    rc->last_q[INTER_FRAME] = qindex;
-    rc->avg_frame_qindex[INTER_FRAME] =
+    if (rc->is_src_frame_alt_ref ||
+        !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
+        (cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
+      rc->last_q[INTER_FRAME] = qindex;
+      rc->avg_frame_qindex[INTER_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
-    rc->ni_frames++;
-    rc->tot_q += vp9_convert_qindex_to_q(qindex);
-    rc->avg_q = rc->tot_q / rc->ni_frames;
-    // Calculate the average Q for normal inter frames (not key or GFU frames).
-    rc->ni_tot_qi += qindex;
-    rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+      rc->ni_frames++;
+      rc->tot_q += vp9_convert_qindex_to_q(qindex);
+      rc->avg_q = rc->tot_q / rc->ni_frames;
+      // Calculate the average Q for normal inter frames (not key or GFU
+      // frames).
+      rc->ni_tot_qi += qindex;
+      rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+    }
   }
 
   // Keep record of last boosted (KF/KF/ARF) Q value.
@@ -1227,8 +1220,8 @@
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   const SVC *const svc = &cpi->svc;
-  const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
-  const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
+  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
   int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
   int target = rc->avg_frame_bandwidth;
   if (svc->number_temporal_layers > 1 &&
@@ -1259,8 +1252,8 @@
   const SVC *const svc = &cpi->svc;
   int target;
   if (cpi->common.current_video_frame == 0) {
-    target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX)
-      ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2);
+    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+      ? INT_MAX : (int)(rc->starting_buffer_level / 2);
   } else {
     int kf_boost = 32;
     double framerate = oxcf->framerate;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index d6a0151..f1a4a3f 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -61,7 +61,7 @@
   int ni_av_qi;
   int ni_tot_qi;
   int ni_frames;
-  int avg_frame_qindex[3];        // 0 - KEY, 1 - INTER, 2 - ARF/GF
+  int avg_frame_qindex[FRAME_TYPES];
   double tot_q;
   double avg_q;
 
@@ -84,6 +84,10 @@
 
   int worst_quality;
   int best_quality;
+
+  int64_t starting_buffer_level;
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
   // int active_best_quality;
 } RATE_CONTROL;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1557c8d..f68aa27 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2129,8 +2129,7 @@
 
     // Find sad for current vector.
     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
-                                           ref_y_ptr, ref_y_stride,
-                                           0x7fffffff);
+                                           ref_y_ptr, ref_y_stride);
 
     // Note if it is the best so far.
     if (this_sad < best_sad) {
@@ -3794,11 +3793,6 @@
         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
             (int)ref_frame) {
       continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               ref_frame != INTRA_FRAME) {
-      continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
@@ -4023,15 +4017,10 @@
     }
 
     if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
-                                                         SEG_LVL_SKIP);
+      // Skip is never coded at the segment level for sub8x8 blocks and instead
+      // always coded in the bitstream at the mode info level.
 
-      if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+      if (ref_frame != INTRA_FRAME && !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
@@ -4046,7 +4035,7 @@
           rate_uv = 0;
           this_skip2 = 1;
         }
-      } else if (mb_skip_allowed) {
+      } else {
         // Add in the cost of the no skip flag.
         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
       }
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 5ea09a8..e85d08a 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -99,41 +99,44 @@
                                     int step_param, int error_per_bit,
                                     const MV *ref_mv, MV *tmp_mv,
                                     int var_max, int rd) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const SEARCH_METHODS method = sf->search_method;
+  vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
 
-  if (cpi->sf.search_method == FAST_DIAMOND) {
-    var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == FAST_HEX) {
-    var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == HEX) {
-    var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                         &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == SQUARE) {
-    var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                            &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else if (cpi->sf.search_method == BIGDIA) {
-    var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                            &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
-    if (rd && var < var_max)
-      var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
-  } else {
-    int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
-    var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                                 further_steps, 1, &cpi->fn_ptr[bsize],
-                                 ref_mv, tmp_mv);
+  switch (method) {
+    case FAST_DIAMOND:
+      var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                                fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case FAST_HEX:
+      var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                                fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case HEX:
+      var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                           fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case SQUARE:
+      var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
+                              fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case BIGDIA:
+      var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                              fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case NSTEP:
+      var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                                   (sf->max_step_search_steps - 1) - step_param,
+                                   1, fn_ptr, ref_mv, tmp_mv);
+      break;
+    default:
+      assert(!"Invalid search method.");
   }
 
+  if (method != NSTEP && rd && var < var_max)
+    var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+
   return var;
 }
 
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index 892e905..d062636 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -35,14 +35,12 @@
 
 #define sadMxN(m, n) \
 unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                  const uint8_t *ref, int ref_stride, \
-                                  unsigned int max_sad) { \
+                                  const uint8_t *ref, int ref_stride) { \
   return sad(src, src_stride, ref, ref_stride, m, n); \
 } \
 unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
-                                      const uint8_t *second_pred, \
-                                      unsigned int max_sad) { \
+                                      const uint8_t *second_pred) { \
   uint8_t comp_pred[m * n]; \
   vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
   return sad(src, src_stride, comp_pred, m, m, n); \
@@ -54,8 +52,7 @@
                                 unsigned int *sads) { \
   int i; \
   for (i = 0; i < k; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride, \
-                                   0x7fffffff); \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
 }
 
 #define sadMxNx4D(m, n) \
@@ -64,8 +61,7 @@
                              unsigned int *sads) { \
   int i; \
   for (i = 0; i < 4; ++i) \
-    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride, \
-                                   0x7fffffff); \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
 }
 
 // 64x64
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 15b9861..b7f8397 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -291,7 +291,7 @@
   sf->subpel_search_method = SUBPEL_TREE;
   sf->subpel_iters_per_step = 2;
   sf->subpel_force_stop = 0;
-  sf->optimize_coefficients = !oxcf->lossless;
+  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
   sf->reduce_first_step_size = 0;
   sf->auto_mv_step_size = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index c25314b..1b99575 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -54,7 +54,7 @@
       lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
     }
 
-    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
+    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
                                     lc->target_bandwidth, 1000);
     lrc->bits_off_target = lrc->buffer_level;
   }
@@ -87,14 +87,14 @@
     }
     bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
-    lc->starting_buffer_level =
-        (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
-    lc->optimal_buffer_level =
-        (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
-    lc->maximum_buffer_size =
-        (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
-    lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
-    lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
+    lrc->starting_buffer_level =
+        (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+    lrc->optimal_buffer_level =
+        (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+    lrc->maximum_buffer_size =
+        (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+    lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+    lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
     // Update framerate-related quantities.
     if (svc->number_temporal_layers > 1) {
       lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
@@ -160,9 +160,6 @@
   cpi->rc = lc->rc;
   cpi->twopass = lc->twopass;
   cpi->oxcf.target_bandwidth = lc->target_bandwidth;
-  cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
-  cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
-  cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   if (cpi->svc.number_temporal_layers > 1) {
@@ -178,9 +175,6 @@
   lc->rc = cpi->rc;
   lc->twopass = cpi->twopass;
   lc->target_bandwidth = (int)oxcf->target_bandwidth;
-  lc->starting_buffer_level = oxcf->starting_buffer_level;
-  lc->optimal_buffer_level = oxcf->optimal_buffer_level;
-  lc->maximum_buffer_size = oxcf->maximum_buffer_size;
 }
 
 void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 6881ce1..36e2027 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -22,9 +22,6 @@
 typedef struct {
   RATE_CONTROL rc;
   int target_bandwidth;
-  int64_t starting_buffer_level;
-  int64_t optimal_buffer_level;
-  int64_t maximum_buffer_size;
   double framerate;
   int avg_frame_size;
   TWO_PASS twopass;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 02bed89..eb5ae2e 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -156,16 +156,15 @@
   return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
 }
 
-
-void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride,
-                             const uint8_t *ref_ptr, int ref_stride,
-                             unsigned int *sse, int *sum) {
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {
   variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
 }
 
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
-                           const uint8_t *ref_ptr, int ref_stride,
-                           unsigned int *sse, int *sum) {
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
+                     unsigned int *sse, int *sum) {
   variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
 }
 
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index c47fe13..4a194b7 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -25,15 +25,13 @@
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
                                     const uint8_t *ref_ptr,
-                                    int ref_stride,
-                                    unsigned int max_sad);
+                                    int ref_stride);
 
 typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride,
-                                        const uint8_t *second_pred,
-                                        unsigned int max_sad);
+                                        const uint8_t *second_pred);
 
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 8723a71..28458dc 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -23,6 +23,7 @@
 pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
 %endmacro
 
+TRANSFORM_COEFFS 11585,  11585
 TRANSFORM_COEFFS 15137,   6270
 TRANSFORM_COEFFS 16069,   3196
 TRANSFORM_COEFFS  9102,  13623
@@ -83,7 +84,7 @@
 %endmacro
 
 ; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 0
+%macro FDCT8_1D 1
   SUM_SUB            0,  7,  9
   SUM_SUB            1,  6,  9
   SUM_SUB            2,  5,  9
@@ -92,14 +93,21 @@
   SUM_SUB            0,  3,  9
   SUM_SUB            1,  2,  9
   SUM_SUB            6,  5,  9
+%if %1 == 0
   SUM_SUB            0,  1,  9
+%endif
 
   BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
 
   pmulhrsw           m6, m12
   pmulhrsw           m5, m12
+%if %1 == 0
   pmulhrsw           m0, m12
   pmulhrsw           m1, m12
+%else
+  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
+  SWAP               0,  1
+%endif
 
   SUM_SUB            4,  5,  9
   SUM_SUB            7,  6,  9
@@ -150,10 +158,10 @@
   psllw              m7, 2
 
   ; column transform
-  FDCT8_1D
+  FDCT8_1D  0
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
 
-  FDCT8_1D
+  FDCT8_1D  1
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
 
   DIVIDE_ROUND_2X   0, 1, 9, 10
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d52424a..72768e1 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -331,8 +331,10 @@
   oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
 
-  oxcf->best_allowed_q  = vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
-  oxcf->worst_allowed_q = vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
+  oxcf->best_allowed_q =
+      extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
+  oxcf->worst_allowed_q =
+      extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
   oxcf->cq_level        = vp9_quantizer_to_qindex(extra_cfg->cq_level);
   oxcf->fixed_q = -1;
 
@@ -343,9 +345,9 @@
   oxcf->scaled_frame_width       = cfg->rc_scaled_width;
   oxcf->scaled_frame_height      = cfg->rc_scaled_height;
 
-  oxcf->maximum_buffer_size     = cfg->rc_buf_sz;
-  oxcf->starting_buffer_level   = cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level    = cfg->rc_buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms   = cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = cfg->rc_buf_optimal_sz;
 
   oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
 
@@ -376,8 +378,6 @@
   oxcf->tile_columns = extra_cfg->tile_columns;
   oxcf->tile_rows    = extra_cfg->tile_rows;
 
-  oxcf->lossless = extra_cfg->lossless;
-
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;