Merge "win: Include <intrin.h> instead of manually declaring intrinsics."
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 33f658e..ee887ab 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -185,6 +185,25 @@
 #
 # Boolean Manipulation Functions
 #
+
+enable_codec(){
+  # Enable codec $1; a codec family also enables its encoder and decoder.
+  enabled "$1" || echo "  enabling $1"
+  set_all yes "$1"
+  if is_in "$1" vp8 vp9 vp10; then
+    set_all yes "${1}_encoder" "${1}_decoder"
+  fi  # unlike an && chain, leaves status 0 when $1 is not a codec family
+}
+
+disable_codec(){
+  # Disable codec $1; a codec family also disables its encoder and decoder.
+  disabled "$1" || echo "  disabling $1"
+  set_all no "$1"
+  if is_in "$1" vp8 vp9 vp10; then
+    set_all no "${1}_encoder" "${1}_decoder"
+  fi  # unlike an && chain, leaves status 0 when $1 is not a codec family
+}
+
 enable_feature(){
   set_all yes $*
 }
@@ -521,22 +540,20 @@
         ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
           [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
         elif [ $action = "disable" ] && ! disabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
           log_echo "  disabling $option"
         elif [ $action = "enable" ] && ! enabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
           log_echo "  enabling $option"
         fi
         ${action}_feature $option
         ;;
       --require-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
             RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
         else
             die_unknown $opt
@@ -638,6 +655,26 @@
   xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
 }
 
+# Print the Xcode version (2nd field of the first `xcodebuild -version` line).
+show_xcode_version() {
+  xcodebuild -version | head -n1 | cut -d' ' -f2
+}
+
+# Fails (returns 1) when Xcode is older than 6.3. Runs xcodebuild only once.
+check_xcode_minimum_version() {
+  xcode_version=$(show_xcode_version)
+  xcode_major=${xcode_version%%.*}
+  xcode_minor=${xcode_version#*.}; xcode_minor=${xcode_minor%%.*}
+  xcode_min_major=6
+  xcode_min_minor=3
+  if [ "${xcode_major}" -lt ${xcode_min_major} ]; then
+    return 1
+  elif [ "${xcode_major}" -eq ${xcode_min_major} ] \
+    && [ "${xcode_minor}" -lt ${xcode_min_minor} ]; then
+    return 1
+  fi
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -751,13 +788,14 @@
   enabled shared && soft_enable pic
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
+  # Shared library framework builds are only possible on iOS 8 and later.
   if enabled shared; then
     IOS_VERSION_OPTIONS="--enable-shared"
+    IOS_VERSION_MIN="8.0"
   else
     IOS_VERSION_OPTIONS=""
+    IOS_VERSION_MIN="6.0"
   fi
-  IOS_VERSION_MIN=$("${source_path}/build/make/ios-version.sh" \
-    ${IOS_VERSION_OPTIONS})
 
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
@@ -1050,6 +1088,19 @@
             [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
           done
 
+          case ${tgt_isa} in
+            armv7|armv7s|armv8|arm64)
+              if enabled neon && ! check_xcode_minimum_version; then
+                soft_disable neon
+                log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
+                if enabled neon_asm; then
+                  soft_disable neon_asm
+                  log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
+                fi
+              fi
+              ;;
+          esac
+
           asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
 
           if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
@@ -1105,7 +1156,7 @@
             check_add_ldflags -mfp64
             ;;
           i6400)
-            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight 
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
             check_add_cflags  -mload-store-pairs -mhard-float -mfp64
             check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
             check_add_ldflags -mips64r6 -mabi=64 -mfp64
diff --git a/build/make/ios-version.sh b/build/make/ios-version.sh
deleted file mode 100755
index 7252eb4..0000000
--- a/build/make/ios-version.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/sh
-##
-##  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-if [ "$1" = "--enable-shared" ]; then
-  # Shared library framework builds are only possible on iOS 8 and later.
-  echo "8.0"
-else
-  echo "6.0"
-fi
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 96dc6cc..c703f22 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -347,10 +347,11 @@
 
 if [ "$ENABLE_SHARED" = "yes" ]; then
   IOS_VERSION_OPTIONS="--enable-shared"
+  IOS_VERSION_MIN="8.0"
 else
   IOS_VERSION_OPTIONS=""
+  IOS_VERSION_MIN="6.0"
 fi
-IOS_VERSION_MIN=$("${SCRIPT_DIR}/ios-version.sh" ${IOS_VERSION_OPTIONS})
 
 if [ "${VERBOSE}" = "yes" ]; then
 cat << EOF
diff --git a/configure b/configure
index 04ea0f4..73b0e0a 100755
--- a/configure
+++ b/configure
@@ -195,12 +195,12 @@
 fi
 
 # disable codecs when their source directory does not exist
-[ -d "${source_path}/vp8" ] || disable_feature vp8
-[ -d "${source_path}/vp9" ] || disable_feature vp9
-[ -d "${source_path}/vp10" ] || disable_feature vp10
+[ -d "${source_path}/vp8" ] || disable_codec vp8
+[ -d "${source_path}/vp9" ] || disable_codec vp9
+[ -d "${source_path}/vp10" ] || disable_codec vp10
 
 # disable vp10 codec by default
-disable_feature vp10
+disable_codec vp10
 
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
@@ -391,15 +391,19 @@
     for opt do
         optval="${opt#*=}"
         case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --disable-codecs)
+          for c in ${CODEC_FAMILIES}; do disable_codec $c; done
+          ;;
         --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${EXPERIMENT_LIST}; then
             if enabled experimental; then
                 ${action}_feature $option
             else
                 log_echo "Ignoring $opt -- not in experimental mode."
             fi
+        elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then
+            ${action}_codec ${option}
         else
             process_common_cmdline $opt
         fi
@@ -413,14 +417,6 @@
 post_process_cmdline() {
     c=""
 
-    # If the codec family is disabled, disable all components of that family.
-    # If the codec family is enabled, enable all components of that family.
-    log_echo "Configuring selected codecs"
-    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable_feature ${c}
-        enabled ${c%%_*} && enable_feature ${c}
-    done
-
     # Enable all detected codecs, if they haven't been disabled
     for c in ${CODECS}; do soft_enable $c; done
 
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 3941e16..2f1db9c 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -90,7 +90,7 @@
           << pkt->data.frame.pts;
     }
 
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
 
     // Subtract from the buffer the bits associated with a played back frame.
     bits_in_buffer_model_ -= frame_size_in_bits;
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 400939a..7a5bd5b 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -21,20 +21,7 @@
 
 using ::libvpx_test::ACMRandom;
 
-typedef void (*Hadamard8x8Func)(const int16_t *a, int a_stride,
-                                int16_t *b);
-
-class HadamardTest : public ::testing::TestWithParam<Hadamard8x8Func> {
- public:
-  virtual void SetUp() {
-    h_func_ = GetParam();
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  Hadamard8x8Func h_func_;
-  ACMRandom rnd_;
-};
+typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
 
 void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
   int16_t b[8];
@@ -59,28 +46,74 @@
   out[5] = c[3] - c[7];
 }
 
-void reference_hadamard(const int16_t *a, int a_stride, int16_t *b) {
+void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
   int16_t buf[64];
-  for (int i = 0; i < 8; i++) {
+  for (int i = 0; i < 8; ++i) {
     hadamard_loop(a + i, a_stride, buf + i * 8);
   }
 
-  for (int i = 0; i < 8; i++) {
+  for (int i = 0; i < 8; ++i) {
     hadamard_loop(buf + i, 8, b + i * 8);
   }
 }
 
-TEST_P(HadamardTest, CompareReferenceRandom) {
+void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const int16_t a0 = b[0];
+    const int16_t a1 = b[64];
+    const int16_t a2 = b[128];
+    const int16_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
+    const int16_t b0 = (a0 + a1) >> 1;
+    const int16_t b1 = (a0 - a1) >> 1;
+    const int16_t b2 = (a2 + a3) >> 1;
+    const int16_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[  0] = b0 + b2;
+    b[ 64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  HadamardFunc h_func_;
+  ACMRandom rnd_;
+};
+
+class Hadamard8x8Test : public HadamardTestBase {};
+
+TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
   DECLARE_ALIGNED(16, int16_t, a[64]);
   DECLARE_ALIGNED(16, int16_t, b[64]);
   int16_t b_ref[64];
-  for (int i = 0; i < 64; i++) {
+  for (int i = 0; i < 64; ++i) {
     a[i] = rnd_.Rand9Signed();
   }
   memset(b, 0, sizeof(b));
   memset(b_ref, 0, sizeof(b_ref));
 
-  reference_hadamard(a, 8, b_ref);
+  reference_hadamard8x8(a, 8, b_ref);
   ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
 
   // The order of the output is not important. Sort before checking.
@@ -89,11 +122,11 @@
   EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
 }
 
-TEST_P(HadamardTest, VaryStride) {
+TEST_P(Hadamard8x8Test, VaryStride) {
   DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
   DECLARE_ALIGNED(16, int16_t, b[64]);
   int16_t b_ref[64];
-  for (int i = 0; i < 64 * 8; i++) {
+  for (int i = 0; i < 64 * 8; ++i) {
     a[i] = rnd_.Rand9Signed();
   }
 
@@ -101,7 +134,7 @@
     memset(b, 0, sizeof(b));
     memset(b_ref, 0, sizeof(b_ref));
 
-    reference_hadamard(a, i, b_ref);
+    reference_hadamard8x8(a, i, b_ref);
     ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
 
     // The order of the output is not important. Sort before checking.
@@ -111,21 +144,77 @@
   }
 }
 
-INSTANTIATE_TEST_CASE_P(C, HadamardTest,
+INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_c));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, HadamardTest,
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_sse2));
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, HadamardTest,
+INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_ssse3));
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, HadamardTest,
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_neon));
 #endif  // HAVE_NEON
+
+class Hadamard16x16Test : public HadamardTestBase {};
+
+TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard16x16(a, 16, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 16 * 16);
+  std::sort(b_ref, b_ref + 16 * 16);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard16x16Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard16x16(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 16 * 16);
+    std::sort(b_ref, b_ref + 16 * 16);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_neon));
+#endif  // HAVE_NEON
 }  // namespace
diff --git a/test/level_test.cc b/test/level_test.cc
index 5b9ce41..62d0247 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -22,7 +22,9 @@
      : EncoderTest(GET_PARAM(0)),
        encoding_mode_(GET_PARAM(1)),
        cpu_used_(GET_PARAM(2)),
-       target_level_(0) {}
+       min_gf_internal_(24),
+       target_level_(0),
+       level_(0) {}
   virtual ~LevelTest() {}
 
   virtual void SetUp() {
@@ -47,6 +49,7 @@
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_gf_internal_);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
         encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -54,20 +57,33 @@
         encoder->Control(VP8E_SET_ARNR_TYPE, 3);
       }
     }
+    encoder->Control(VP9E_GET_LEVEL, &level_);
+    ASSERT_LE(level_, 51);
+    ASSERT_GE(level_, 0);
   }
 
   ::libvpx_test::TestMode encoding_mode_;
   int cpu_used_;
+  int min_gf_internal_;
   int target_level_;
+  int level_;
 };
 
+// Test for keeping level stats only
 TEST_P(LevelTest, TestTargetLevel0) {
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       30);
+                                       40);
   target_level_ = 0;
+  min_gf_internal_ = 4;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(11, level_);
+
+  cfg_.rc_target_bitrate = 1600;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(20, level_);
 }
 
+// Test for level control being turned off
 TEST_P(LevelTest, TestTargetLevel255) {
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
                                        30);
@@ -98,7 +114,6 @@
 
 VP9_INSTANTIATE_TEST_CASE(LevelTest,
                           ::testing::Values(::libvpx_test::kTwoPassGood,
-                                            ::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kRealTime),
+                                            ::libvpx_test::kOnePassGood),
                           ::testing::Range(0, 9));
 }  // namespace
diff --git a/test/realtime_test.cc b/test/realtime_test.cc
new file mode 100644
index 0000000..24749e4
--- /dev/null
+++ b/test/realtime_test.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 2;
+
+class RealtimeTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  RealtimeTest()
+      : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
+  virtual ~RealtimeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    cfg_.g_lag_in_frames = 0;
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    // TODO(tomfinegan): We're changing the pass value here to make sure
+    // we get frames when real time mode is combined with |g_pass| set to
+    // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
+    // the pass value based on the mode passed into EncoderTest::SetMode(),
+    // so a |g_pass| value assigned in SetUp() above would be overwritten.
+    cfg_.g_pass = VPX_RC_FIRST_PASS;
+  }
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+    frame_packets_++;
+  }
+
+  int frame_packets_;  // Number of FramePktHook() invocations.
+};
+
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+  video.set_limit(kFramesToEncode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_EQ(kFramesToEncode, frame_packets_);
+}
+
+VP8_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index 1eb702f..04acd96 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -24,6 +24,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += realtime_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index 7201a67..1eca78d 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -12,4 +12,8 @@
 generation.
 
 Local Modifications:
-Removed unused declarations of kPathSeparatorString to have warning free build.
\ No newline at end of file
+- Removed unused declarations of kPathSeparatorString to have warning
+  free build.
+- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P
+  and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables
+  under GCC 5.
\ No newline at end of file
diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h
index 4f3804f..581a44e 100644
--- a/third_party/googletest/src/include/gtest/gtest.h
+++ b/third_party/googletest/src/include/gtest/gtest.h
@@ -16960,7 +16960,7 @@
                       GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
       return 0; \
     } \
-    static int gtest_registering_dummy_; \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
     GTEST_DISALLOW_COPY_AND_ASSIGN_(\
         GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
   }; \
@@ -16972,7 +16972,7 @@
 # define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
   ::testing::internal::ParamGenerator<test_case_name::ParamType> \
       gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  int gtest_##prefix##test_case_name##_dummy_ = \
+  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
       ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
           GetTestCasePatternHolder<test_case_name>(\
               #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index 155847c..472a7b5 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -104,7 +104,7 @@
 extern const unsigned char vp8_mbsplit_offset[4][16];
 
 
-static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
 {
     if (!(b & 3))
     {
@@ -119,7 +119,8 @@
     return (cur_mb->bmi + b - 1)->mv.as_int;
 }
 
-static INLINE int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+                                      int mi_stride)
 {
     if (!(b >> 2))
     {
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index 4bc87eb..566972e 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -986,7 +986,8 @@
     VP8_COMMON *const pc = &pbi->common;
     MACROBLOCKD *const xd  = &pbi->mb;
     const unsigned char *data = pbi->fragments.ptrs[0];
-    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
+    const unsigned int data_sz = pbi->fragments.sizes[0];
+    const unsigned char *data_end = data + data_sz;
     ptrdiff_t first_partition_length_in_bytes;
 
     int i, j, k, l;
@@ -1022,7 +1023,7 @@
         const unsigned char *clear = data;
         if (pbi->decrypt_cb)
         {
-            int n = (int)VPXMIN(sizeof(clear_buffer), data_end - data);
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_sz);
             pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
             clear = clear_buffer;
         }
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index cff99c0..26ce120 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -529,7 +529,7 @@
         // Bias on zero motion vector sse.
         const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
         zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
-        sse_diff = zero_mv_sse - best_sse;
+        sse_diff = (int)zero_mv_sse - (int)best_sse;
 
         saved_mbmi = *mbmi;
 
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 95bb394..c526a3e 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -18,6 +18,7 @@
 #include "onyx_int.h"
 #include "vpx_dsp/variance.h"
 #include "encodeintra.h"
+#include "vp8/common/common.h"
 #include "vp8/common/setupintrarecon.h"
 #include "vp8/common/systemdependent.h"
 #include "mcomp.h"
@@ -2417,7 +2418,7 @@
     int tmp_q;
     int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
 
-    FIRSTPASS_STATS this_frame = {0};
+    FIRSTPASS_STATS this_frame;
     FIRSTPASS_STATS this_frame_copy;
 
     double this_frame_intra_error;
@@ -2425,6 +2426,8 @@
 
     int overhead_bits;
 
+    vp8_zero(this_frame);
+
     if (!cpi->twopass.stats_in)
     {
         return ;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 6617422..d5a0fff 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1523,7 +1523,8 @@
 void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
 {
     VP8_COMMON *cm = &cpi->common;
-    int last_w, last_h, prev_number_of_layers;
+    int last_w, last_h;
+    unsigned int prev_number_of_layers;
 
     if (!cpi)
         return;
@@ -1786,10 +1787,8 @@
     if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
         cpi->force_next_frame_intra = 1;
 
-    if (((cm->Width + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_height ||
+    if (((cm->Width + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
     {
         dealloc_raw_frame_buffers(cpi);
@@ -5221,7 +5220,7 @@
         vp8_second_pass(cpi);
 
     encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
-    cpi->twopass.bits_left -= 8 * *size;
+    cpi->twopass.bits_left -= 8 * (int)(*size);
 
     if (!cpi->common.refresh_alt_ref_frame)
     {
@@ -5857,7 +5856,7 @@
         return -1;
 
     // Check number of rows and columns match
-    if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+    if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols)
         return -1;
 
     // Range check the delta Q values and convert the external Q range values
@@ -5913,7 +5912,7 @@
 
 int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
 {
-    if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+    if ((int)rows == cpi->common.mb_rows && (int)cols == cpi->common.mb_cols)
     {
         if (map)
         {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 6ede9b9..44fbbd4 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -371,7 +371,7 @@
     double key_frame_rate_correction_factor;
     double gf_rate_correction_factor;
 
-    unsigned int frames_since_golden;
+    int frames_since_golden;
     /* Count down till next GF */
     int frames_till_gf_update_due;
 
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 257d2a0..b19ab7a 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -22,6 +22,7 @@
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
 #include "vp8/common/onyx.h"
+#include "vp8/common/common.h"
 #include <stdlib.h>
 #include <string.h>
 
@@ -760,7 +761,7 @@
                                     unsigned long          duration,
                                     unsigned long          deadline)
 {
-    unsigned int new_qc;
+    int new_qc;
 
 #if !(CONFIG_REALTIME_ONLY)
     /* Use best quality mode if no deadline is given. */
@@ -785,7 +786,9 @@
     new_qc = MODE_REALTIME;
 #endif
 
-    if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+    if (deadline == VPX_DL_REALTIME)
+        new_qc = MODE_REALTIME;
+    else if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
         new_qc = MODE_FIRSTPASS;
     else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
         new_qc = (new_qc == MODE_BESTQUALITY)
@@ -1116,7 +1119,8 @@
 {
 
     YV12_BUFFER_CONFIG sd;
-    vp8_ppflags_t flags = {0};
+    vp8_ppflags_t flags;
+    vp8_zero(flags);
 
     if (ctx->preview_ppcfg.post_proc_flag)
     {
@@ -1305,8 +1309,8 @@
         30,                 /* rc_resize_up_thresold */
 
         VPX_VBR,            /* rc_end_usage */
-        {0},                /* rc_twopass_stats_in */
-        {0},                /* rc_firstpass_mb_stats_in */
+        {NULL, 0},          /* rc_twopass_stats_in */
+        {NULL, 0},          /* rc_firstpass_mb_stats_in */
         256,                /* rc_target_bandwidth */
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
@@ -1334,6 +1338,8 @@
         {0},                /* ts_rate_decimator */
         0,                  /* ts_periodicity */
         {0},                /* ts_layer_id */
+        {0},                /* layer_target_bitrate */
+        0                   /* temporal_layering_mode */
     }},
 };
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 9c78de1..fc9288d 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -522,7 +522,8 @@
     {
         YV12_BUFFER_CONFIG sd;
         int64_t time_stamp = 0, time_end_stamp = 0;
-        vp8_ppflags_t flags = {0};
+        vp8_ppflags_t flags;
+        vp8_zero(flags);
 
         if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
         {
@@ -816,11 +817,12 @@
     },
     { /* encoder functions */
         0,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL
+        NULL,  /* vpx_codec_enc_cfg_map_t */
+        NULL,  /* vpx_codec_encode_fn_t */
+        NULL,  /* vpx_codec_get_cx_data_fn_t */
+        NULL,  /* vpx_codec_enc_config_set_fn_t */
+        NULL,  /* vpx_codec_get_global_headers_fn_t */
+        NULL,  /* vpx_codec_get_preview_frame_fn_t */
+        NULL   /* vpx_codec_enc_mr_get_mem_loc_fn_t */
     }
 };
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 2aff132..908fa80 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -67,25 +67,6 @@
 
 #define VP9_FRAME_MARKER 0x2
 
-typedef enum {
-  LEVEL_UNKNOWN = 0,
-  LEVEL_1 = 10,
-  LEVEL_1_1 = 11,
-  LEVEL_2 = 20,
-  LEVEL_2_1 = 21,
-  LEVEL_3 = 30,
-  LEVEL_3_1 = 31,
-  LEVEL_4 = 40,
-  LEVEL_4_1 = 41,
-  LEVEL_5 = 50,
-  LEVEL_5_1 = 51,
-  LEVEL_5_2 = 52,
-  LEVEL_6 = 60,
-  LEVEL_6_1 = 61,
-  LEVEL_6_2 = 62,
-  LEVEL_NOT_CARE = 255,
-} VP9_LEVEL;
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index a6dae6a..3409d04 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -159,3 +159,18 @@
   {0,  8 },  // 64X32 - {0b0000, 0b1000}
   {0,  0 },  // 64X64 - {0b0000, 0b0000}
 };
+
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+const uint8_t need_top_left[INTRA_MODES] = {
+    0,  // DC_PRED
+    0,  // V_PRED
+    0,  // H_PRED
+    0,  // D45_PRED
+    1,  // D135_PRED
+    1,  // D117_PRED
+    1,  // D153_PRED
+    0,  // D207_PRED
+    0,  // D63_PRED
+    1,  // TM_PRED
+};
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 95a1179..0ae24da 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -33,6 +33,9 @@
 extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES];
 extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
 extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+extern const uint8_t need_top_left[INTRA_MODES];
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 1df6f08..3fd935e 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -168,8 +168,6 @@
 
   int allow_high_precision_mv;
 
-  int keep_level_stats;
-
   // Flag signaling that the frame context should be reset to default values.
   // 0 or 1 implies don't reset, 2 reset just the context specified in the
   // frame header, 3 reset all contexts.
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index fe89829..ffc6839 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -902,4 +902,10 @@
       frame_mvs += cm->mi_cols;
     }
   }
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        !is_inter_block(mi) && need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 61cca39..73a2db0 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -891,7 +891,7 @@
     vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
 }
 
-static int get_refresh_mask(VP9_COMP *cpi) {
+int vp9_get_refresh_mask(VP9_COMP *cpi) {
   if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term we leave it in the GF slot and,
@@ -1107,11 +1107,11 @@
         write_bitdepth_colorspace_sampling(cm, wb);
       }
 
-      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
-      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
         vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index da6b414..f24d20f 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -18,6 +18,8 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 
+int vp9_get_refresh_mask(VP9_COMP *cpi);
+
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index bbdfbb8..069c335 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -146,9 +146,9 @@
   uint8_t sb_is_skin;
 
   // Used to save the status of whether a block has a low variance in
-  // choose_partitioning. 0 for 64x64, 1 2 for 64x32, 3 4 for 32x64, 5~8 for
-  // 32x32.
-  uint8_t variance_low[9];
+  // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
+  // 32x32, 9~24 for 16x16.
+  uint8_t variance_low[25];
 
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 21a66bb..984f98a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -134,7 +134,7 @@
                                0, &sse);
       break;
   }
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+  return ROUND_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -773,9 +773,7 @@
     }
   }
 
-  for (i = 0; i < 9; i++) {
-    x->variance_low[i] = 0;
-  }
+  memset(x->variance_low, 0, sizeof(x->variance_low));
 
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -798,7 +796,7 @@
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
 
     const YV12_BUFFER_CONFIG *yv12_g = NULL;
-    unsigned int y_sad, y_sad_g;
+    unsigned int y_sad, y_sad_g, y_sad_thr;
     const BLOCK_SIZE bsize = BLOCK_32X32
         + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
 
@@ -831,7 +829,10 @@
     mi->interp_filter = BILINEAR;
 
     y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
-    if (y_sad_g < y_sad) {
+    // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
+    // are close if short_circuit_low_temp_var is on.
+    y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
+    if (y_sad_g < y_sad_thr) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
       mi->ref_frame[0] = GOLDEN_FRAME;
@@ -1083,28 +1084,53 @@
   }
 
   if (cpi->sf.short_circuit_low_temp_var) {
-    // Set low variance flag, only for blocks >= 32x32 and if LAST_FRAME was
-    // selected.
-    if (ref_frame_partition == LAST_FRAME) {
+    const int mv_thr = cm->width > 640 ? 8 : 4;
+    // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected
+    // and int_pro mv is small. If the temporal variance is small set the
+    // variance_low flag for the block. The variance threshold can be adjusted,
+    // the higher the more aggressive.
+    if (ref_frame_partition == LAST_FRAME &&
+        (cpi->sf.short_circuit_low_temp_var == 1 ||
+         (xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+          xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+          xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+          xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
       if (xd->mi[0]->sb_type == BLOCK_64X64 &&
           vt.part_variances.none.variance < (thresholds[0] >> 1)) {
         x->variance_low[0] = 1;
       } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
-        if (vt.part_variances.horz[0].variance < (thresholds[0] >> 2))
-          x->variance_low[1] = 1;
-        if (vt.part_variances.horz[1].variance < (thresholds[0] >> 2))
-          x->variance_low[2] = 1;
+        for (j = 0; j < 2; j++) {
+          if (vt.part_variances.horz[j].variance < (thresholds[0] >> 2))
+            x->variance_low[j + 1] = 1;
+        }
       } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
-        if (vt.part_variances.vert[0].variance < (thresholds[0] >> 2))
-          x->variance_low[3] = 1;
-        if (vt.part_variances.vert[1].variance < (thresholds[0] >> 2))
-          x->variance_low[4] = 1;
+        for (j = 0; j < 2; j++) {
+          if (vt.part_variances.vert[j].variance < (thresholds[0] >> 2))
+            x->variance_low[j + 3] = 1;
+        }
       } else {
-        // 32x32
         for (i = 0; i < 4; i++) {
-          if (!force_split[i + 1] &&
-              vt.split[i].part_variances.none.variance < (thresholds[1] >> 1))
-            x->variance_low[i + 5] = 1;
+          if (!force_split[i + 1]) {
+            // 32x32
+            if (vt.split[i].part_variances.none.variance <
+                (thresholds[1] >> 1))
+              x->variance_low[i + 5] = 1;
+          } else if (cpi->sf.short_circuit_low_temp_var == 2) {
+            int idx[4] = {0, 4, xd->mi_stride << 2, (xd->mi_stride << 2) + 4};
+            const int idx_str = cm->mi_stride * mi_row + mi_col + idx[i];
+            MODE_INFO **this_mi = cm->mi_grid_visible + idx_str;
+            // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+            // inside.
+            if ((*this_mi)->sb_type == BLOCK_16X16 ||
+                (*this_mi)->sb_type == BLOCK_32X16 ||
+                (*this_mi)->sb_type == BLOCK_16X32) {
+              for (j = 0; j < 4; j++) {
+                if (vt.split[i].split[j].part_variances.none.variance <
+                    (thresholds[2] >> 8))
+                  x->variance_low[(i << 2) + j + 9] = 1;
+              }
+            }
+          }
         }
       }
     }
@@ -4417,6 +4443,12 @@
 
   if (!is_inter_block(mi)) {
     int plane;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
     mi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
       vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane, 1);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 936e625..fdf403e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -50,16 +50,14 @@
                      pd->dst.buf, pd->dst.stride);
 }
 
-#define RDTRUNC(RM, DM, R, D)                        \
-  (((1 << (VP9_PROB_COST_SHIFT - 1)) + (R) * (RM)) & \
-   ((1 << VP9_PROB_COST_SHIFT) - 1))
-
+// TODO(aconverse): Re-pack this structure.
 typedef struct vp9_token_state {
   int           rate;
-  int           error;
+  int64_t       error;
   int           next;
   int16_t       token;
   tran_low_t    qc;
+  tran_low_t    dqc;
 } vp9_token_state;
 
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {10, 6}, {8, 7}, };
@@ -68,10 +66,6 @@
 {\
   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
 }
 
 // This function is a place holder for now but may ultimately need
@@ -102,17 +96,19 @@
   const int eob = p->eobs[block];
   const PLANE_TYPE type = get_plane_type(plane);
   const int default_eob = 16 << (tx_size << 1);
-  const int mul = 1 + (tx_size == TX_32X32);
-  const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const int shift = (tx_size == TX_32X32);
+  const int16_t* const dequant_ptr = pd->dequant;
+  const uint8_t* const band_translate = get_band_translate(tx_size);
   const scan_order *const so = get_scan(xd, tx_size, type, block);
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
+  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
   int next = eob, sz = 0;
   const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1;
   const int64_t rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1;
+  int rate0, rate1;
+  int64_t error0, error1;
   int16_t t0, t1;
   EXTRABIT e0;
   int best, band, pt, i, final_eob;
@@ -164,7 +160,7 @@
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
-      dx = mul * (dqcoeff[rc] - coeff[rc]);
+      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         dx >>= xd->bd - 8;
@@ -176,14 +172,15 @@
       tokens[i][0].next = next;
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
+      tokens[i][0].dqc = dqcoeff[rc];
       best_index[i][0] = best;
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
-          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+          (abs(x) * dequant_ptr[rc != 0] < (abs(coeff[rc]) << shift) +
                                                dequant_ptr[rc != 0]))
         shortcut = 1;
       else
@@ -192,6 +189,11 @@
       if (shortcut) {
         sz = -(x < 0);
         x -= 2 * sz + 1;
+      } else {
+        tokens[i][1] = tokens[i][0];
+        best_index[i][1] = best_index[i][0];
+        next = i;
+        continue;
       }
 
       /* Consider both possible successor states. */
@@ -242,6 +244,24 @@
       tokens[i][1].next = next;
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
+
+      if (x) {
+        tran_low_t offset = dq_step[rc != 0];
+        // The 32x32 transform coefficient uses half quantization step size.
+        // Account for the rounding difference in the dequantized coefficient
+        // value when the quantization index is dropped from an even number
+        // to an odd number.
+        if (shift & x)
+          offset += (dequant_ptr[rc != 0] & 0x01);
+
+        if (sz == 0)
+          tokens[i][1].dqc = dqcoeff[rc] - offset;
+        else
+          tokens[i][1].dqc = dqcoeff[rc] + offset;
+      } else {
+        tokens[i][1].dqc = 0;
+      }
+
       best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
@@ -281,18 +301,13 @@
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = -1;
-  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
-  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+
   for (i = next; i < eob; i = next) {
     const int x = tokens[i][best].qc;
     const int rc = scan[i];
-    if (x) {
-      final_eob = i;
-    }
-
+    if (x) final_eob = i;
     qcoeff[rc] = x;
-    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
-
+    dqcoeff[rc] = tokens[i][best].dqc;
     next = tokens[i][best].next;
     best = best_index[i][best];
   }
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 8201794..3f88d9c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -86,6 +86,25 @@
 FILE *keyfile;
 #endif
 
+static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
+  {LEVEL_1,   829440,      36864,    200,    400,   2, 1,  4,  8},
+  {LEVEL_1_1, 2764800,     73728,    800,    1000,  2, 1,  4,  8},
+  {LEVEL_2,   4608000,     122880,   1800,   1500,  2, 1,  4,  8},
+  {LEVEL_2_1, 9216000,     245760,   3600,   2800,  2, 2,  4,  8},
+  {LEVEL_3,   20736000,    552960,   7200,   6000,  2, 4,  4,  8},
+  {LEVEL_3_1, 36864000,    983040,   12000,  10000, 2, 4,  4,  8},
+  {LEVEL_4,   83558400,    2228224,  18000,  16000, 4, 4,  4,  8},
+  {LEVEL_4_1, 160432128,   2228224,  30000,  18000, 4, 4,  5,  6},
+  {LEVEL_5,   311951360,   8912896,  60000,  36000, 6, 8,  6,  4},
+  {LEVEL_5_1, 588251136,   8912896,  120000, 46000, 8, 8,  10, 4},
+  // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
+  // they are finalized (currently TBD).
+  {LEVEL_5_2, 1176502272,  8912896,  180000, 0,     8, 8,  10, 4},
+  {LEVEL_6,   1176502272,  35651584, 180000, 0,     8, 16, 10, 4},
+  {LEVEL_6_1, 2353004544u, 35651584, 240000, 0,     8, 16, 10, 4},
+  {LEVEL_6_2, 4706009088u, 35651584, 480000, 0,     8, 16, 10, 4},
+};
+
 static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -159,6 +178,39 @@
   }
 }
 
+static void init_level_info(Vp9LevelInfo *level_info) {
+  Vp9LevelStats *const level_stats = &level_info->level_stats;
+  Vp9LevelSpec *const level_spec = &level_info->level_spec;
+  // Zero both records, then seed the two fields whose start value is not 0.
+  memset(level_stats, 0, sizeof(*level_stats));
+  memset(level_spec, 0, sizeof(*level_spec));
+  level_spec->level = LEVEL_UNKNOWN;
+  level_spec->min_altref_distance = INT_MAX;  // tracked as a running minimum
+}
+
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec * const level_spec) {
+  int i;
+  const Vp9LevelSpec *this_level;
+  // Returns the first vp9_level_defs entry whose limits level_spec satisfies.
+  vpx_clear_system_state();
+  // Walk the table in order; failing any single limit skips that entry.
+  for (i = 0; i < VP9_LEVELS; ++i) {
+    this_level = &vp9_level_defs[i];
+    if ((double)level_spec->max_luma_sample_rate >
+        (double)this_level->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) ||
+        level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+        level_spec->average_bitrate > this_level->average_bitrate ||
+        level_spec->max_cpb_size > this_level->max_cpb_size ||
+        level_spec->compression_ratio < this_level->compression_ratio ||
+        level_spec->max_col_tiles > this_level->max_col_tiles ||
+        level_spec->min_altref_distance < this_level->min_altref_distance ||
+        level_spec->max_ref_frame_buffers > this_level->max_ref_frame_buffers)
+      continue;
+    break;
+  }
+  return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
+}
+
 int vp9_set_active_map(VP9_COMP* cpi,
                        unsigned char* new_map_16x16,
                        int rows,
@@ -783,7 +835,7 @@
   cm->color_range = oxcf->color_range;
 
   cpi->target_level = oxcf->target_level;
-  cm->keep_level_stats = oxcf->target_level != LEVEL_NOT_CARE;
+  cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
@@ -1476,7 +1528,7 @@
   cm->color_range = oxcf->color_range;
 
   cpi->target_level = oxcf->target_level;
-  cm->keep_level_stats = oxcf->target_level != LEVEL_NOT_CARE;
+  cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
 
   if (cm->profile <= PROFILE_1)
     assert(cm->bit_depth == VPX_BITS_8);
@@ -1660,7 +1712,6 @@
   } while (++i <= MV_MAX);
 }
 
-
 VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
@@ -1749,6 +1800,9 @@
   cpi->multi_arf_last_grp_enabled = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+
+  init_level_info(&cpi->level_info);
+
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_ssimg = 0;
   cpi->b_calculate_blockiness = 1;
@@ -2199,7 +2253,7 @@
 static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
                                       const uint8_t *b8, int  b_stride,
                                       int w, int h, uint64_t *sse,
-                                      uint64_t *sum) {
+                                      int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -2223,7 +2277,7 @@
                                       int w, int h,
                                       unsigned int *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
                             &sse_long, &sum_long);
   *sse = (unsigned int)sse_long;
@@ -2798,7 +2852,7 @@
   } else if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
-    // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+    // vp9_get_refresh_mask() we left it in the GF slot and, if
     // we're updating the GF with the current decoded frame, we save it to the
     // ARF slot instead.
     // We now have to update the ARF with the current frame and swap gld_fb_idx
@@ -4420,6 +4474,124 @@
 }
 #endif  // CONFIG_INTERNAL_STATS
 
+static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  Vp9LevelInfo *const level_info = &cpi->level_info;
+  Vp9LevelSpec *const level_spec = &level_info->level_spec;
+  Vp9LevelStats *const level_stats = &level_info->level_stats;
+  int i, idx;
+  uint64_t luma_samples, dur_end = 0;
+  const uint32_t luma_pic_size = cm->width * cm->height;
+  double cpb_data_size;
+  // Folds the frame just coded (*size bytes) into cpi->level_info.
+  vpx_clear_system_state();
+
+  // update level_stats
+  level_stats->total_compressed_size += *size;
+  if (cm->show_frame) {
+    level_stats->total_uncompressed_size +=
+        luma_pic_size +
+        2 * (luma_pic_size >> (cm->subsampling_x + cm->subsampling_y));
+    level_stats->time_encoded =
+        (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+        (double)TICKS_PER_SEC;
+  }
+  // arf_src_index > 0: keep the smallest frame gap between such frames.
+  if (arf_src_index > 0) {
+    if (!level_stats->seen_first_altref) {
+      level_stats->seen_first_altref = 1;
+    } else if (level_stats->frames_since_last_altref <
+             level_spec->min_altref_distance) {
+      level_spec->min_altref_distance = level_stats->frames_since_last_altref;
+    }
+    level_stats->frames_since_last_altref = 0;
+  } else {
+    ++level_stats->frames_since_last_altref;
+  }
+  // Ring buffer of recent frames: fill every slot, then overwrite the oldest.
+  if (level_stats->frame_window_buffer.len < FRAME_WINDOW_SIZE) {
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len++) % FRAME_WINDOW_SIZE;
+  } else {
+    idx = level_stats->frame_window_buffer.start;
+    level_stats->frame_window_buffer.start = (idx + 1) % FRAME_WINDOW_SIZE;
+  }
+  level_stats->frame_window_buffer.buf[idx].ts = cpi->last_time_stamp_seen;
+  level_stats->frame_window_buffer.buf[idx].size = (uint32_t)(*size);
+  level_stats->frame_window_buffer.buf[idx].luma_samples = luma_pic_size;
+  // Count the reference slots marked in the map since the last key frame.
+  if (cm->frame_type == KEY_FRAME) {
+    level_stats->ref_refresh_map = 0;
+  } else {
+    int count = 0;
+    level_stats->ref_refresh_map |= vp9_get_refresh_mask(cpi);
+    // Also need to consider the case where the encoder refers to a buffer
+    // that has been implicitly refreshed after encoding a keyframe.
+    if (!cm->intra_only) {
+      level_stats->ref_refresh_map |= (1 << cpi->lst_fb_idx);
+      level_stats->ref_refresh_map |= (1 << cpi->gld_fb_idx);
+      level_stats->ref_refresh_map |= (1 << cpi->alt_fb_idx);
+    }
+    for (i = 0; i < REF_FRAMES; ++i) {
+      count += (level_stats->ref_refresh_map >> i) & 1;
+    }
+    if (count > level_spec->max_ref_frame_buffers) {
+      level_spec->max_ref_frame_buffers = count;
+    }
+  }
+
+  // update average_bitrate (bytes / 125 == kilobits)
+  level_spec->average_bitrate =
+      (double)level_stats->total_compressed_size / 125.0 /
+      level_stats->time_encoded;
+
+  // update max_luma_sample_rate (samples within TICKS_PER_SEC of newest ts)
+  luma_samples = 0;
+  for (i = 0; i < level_stats->frame_window_buffer.len; ++i) {
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len - 1 - i) % FRAME_WINDOW_SIZE;
+    if (i == 0) {
+      dur_end = level_stats->frame_window_buffer.buf[idx].ts;
+    }
+    if (dur_end - level_stats->frame_window_buffer.buf[idx].ts >=
+        TICKS_PER_SEC) {
+      break;
+    }
+    luma_samples += level_stats->frame_window_buffer.buf[idx].luma_samples;
+  }
+  if (luma_samples > level_spec->max_luma_sample_rate) {
+    level_spec->max_luma_sample_rate = luma_samples;
+  }
+
+  // update max_cpb_size (newest CPB_WINDOW_SIZE frames, in kilobits)
+  cpb_data_size = 0;
+  for (i = 0; i < CPB_WINDOW_SIZE; ++i) {
+    if (i >= level_stats->frame_window_buffer.len) break;
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len - 1 - i) % FRAME_WINDOW_SIZE;
+    cpb_data_size += level_stats->frame_window_buffer.buf[idx].size;
+  }
+  cpb_data_size = cpb_data_size / 125.0;
+  if (cpb_data_size > level_spec->max_cpb_size) {
+    level_spec->max_cpb_size = cpb_data_size;
+  }
+
+  // update max_luma_picture_size
+  if (luma_pic_size > level_spec->max_luma_picture_size) {
+    level_spec->max_luma_picture_size = luma_pic_size;
+  }
+
+  // update compression_ratio
+  level_spec->compression_ratio =
+      (double)level_stats->total_uncompressed_size * cm->bit_depth /
+      level_stats->total_compressed_size / 8.0;
+
+  // update max_col_tiles
+  if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) {
+    level_spec->max_col_tiles = (1 << cm->log2_tile_cols);
+  }
+}
+
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
@@ -4690,6 +4862,9 @@
   if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
 
+  if (cpi->keep_level_stats && oxcf->pass != 1)
+    update_level_info(cpi, size, arf_src_index);
+
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 6be61ac..b65dfa8 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -20,6 +20,7 @@
 #include "vpx_dsp/ssim.h"
 #endif
 #include "vpx_dsp/variance.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -51,6 +52,9 @@
 extern "C" {
 #endif
 
+// vp9 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000
+
 typedef struct {
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
@@ -297,6 +301,69 @@
   double worst;
 } ImageStat;
 
+#define CPB_WINDOW_SIZE 4
+#define FRAME_WINDOW_SIZE 128
+#define SAMPLE_RATE_GRACE_P 0.015
+#define VP9_LEVELS 14
+
+typedef enum {
+  LEVEL_UNKNOWN = 0,
+  LEVEL_1 = 10,
+  LEVEL_1_1 = 11,
+  LEVEL_2 = 20,
+  LEVEL_2_1 = 21,
+  LEVEL_3 = 30,
+  LEVEL_3_1 = 31,
+  LEVEL_4 = 40,
+  LEVEL_4_1 = 41,
+  LEVEL_5 = 50,
+  LEVEL_5_1 = 51,
+  LEVEL_5_2 = 52,
+  LEVEL_6 = 60,
+  LEVEL_6_1 = 61,
+  LEVEL_6_2 = 62,
+  LEVEL_MAX = 255
+} VP9_LEVEL;
+
+typedef struct {
+  VP9_LEVEL level;
+  uint64_t max_luma_sample_rate;
+  uint32_t max_luma_picture_size;
+  double average_bitrate;  // in kilobits per second
+  double max_cpb_size;  // in kilobits
+  double compression_ratio;
+  uint8_t max_col_tiles;
+  uint32_t min_altref_distance;
+  uint8_t max_ref_frame_buffers;
+} Vp9LevelSpec;
+
+typedef struct {
+  int64_t ts;  // timestamp
+  uint32_t luma_samples;
+  uint32_t size;  // in bytes
+} FrameRecord;
+
+typedef struct {
+  FrameRecord buf[FRAME_WINDOW_SIZE];
+  uint8_t start;
+  uint8_t len;
+} FrameWindowBuffer;
+
+typedef struct {
+  uint8_t seen_first_altref;
+  uint32_t frames_since_last_altref;
+  uint64_t total_compressed_size;
+  uint64_t total_uncompressed_size;
+  double time_encoded;  // in seconds
+  FrameWindowBuffer frame_window_buffer;
+  int ref_refresh_map;
+} Vp9LevelStats;
+
+typedef struct {
+  Vp9LevelStats level_stats;
+  Vp9LevelSpec level_spec;
+} Vp9LevelInfo;
+
 typedef struct VP9_COMP {
   QUANTS quants;
   ThreadData td;
@@ -519,6 +586,9 @@
   VPxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   VP9LfSync lf_row_sync;
+
+  int keep_level_stats;
+  Vp9LevelInfo level_info;
 } VP9_COMP;
 
 void vp9_initialize_enc(void);
@@ -674,6 +744,8 @@
   return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
 }
 
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
+
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a70eaea..f6e61b6 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -41,8 +41,6 @@
 #define OUTPUT_FPF          0
 #define ARF_STATS_OUTPUT    0
 
-#define GROUP_ADAPTIVE_MAXQ 1
-
 #define BOOST_BREAKOUT      12.5
 #define BOOST_FACTOR        12.5
 #define FACTOR_PT_LOW       0.70
@@ -1343,6 +1341,7 @@
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
                               (svc->number_temporal_layers > 1);
+  RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = is_two_pass_svc ?
       &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
   double frame_rate;
@@ -1399,15 +1398,21 @@
   }
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
-
-  cpi->rc.rate_error_estimate = 0;
+  rc->vbr_bits_off_target = 0;
+  rc->vbr_bits_off_target_fast = 0;
+  rc->rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
   twopass->last_kfgroup_zeromotion_pct = 100;
 
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initialize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+
   if (oxcf->resize_mode != RESIZE_NONE) {
     init_subsampling(cpi);
   }
@@ -1932,9 +1937,7 @@
   double boost_score = 0.0;
   double old_boost_score = 0.0;
   double gf_group_err = 0.0;
-#if GROUP_ADAPTIVE_MAXQ
   double gf_group_raw_error = 0.0;
-#endif
   double gf_group_skip_pct = 0.0;
   double gf_group_inactive_zone_rows = 0.0;
   double gf_first_frame_err = 0.0;
@@ -1984,9 +1987,7 @@
   // the error score / cost of this frame has already been accounted for.
   if (arf_active_or_kf) {
     gf_group_err -= gf_first_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error -= this_frame->coded_error;
-#endif
     gf_group_skip_pct -= this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
@@ -2040,9 +2041,7 @@
     // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
     gf_group_err += mod_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error += this_frame->coded_error;
-#endif
     gf_group_skip_pct += this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
 
@@ -2142,9 +2141,7 @@
       if (EOF == input_stats(twopass, this_frame))
         break;
       gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-#if GROUP_ADAPTIVE_MAXQ
       gf_group_raw_error += this_frame->coded_error;
-#endif
       gf_group_skip_pct += this_frame->intra_skip_pct;
       gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
     }
@@ -2159,7 +2156,6 @@
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
-#if GROUP_ADAPTIVE_MAXQ
   // Calculate an estimate of the maxq needed for the group.
   // We are more agressive about correcting for sections
   // where there could be significant overshoot than for easier
@@ -2181,7 +2177,6 @@
     twopass->active_worst_quality =
         (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
   }
-#endif
 
   // Calculate the extra bits to be used for boosted frame(s)
   gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
@@ -2718,13 +2713,6 @@
       ((double)cm->mb_rows * section_length);
     int tmp_q;
 
-    // Initialize bits per macro_block estimate correction factor.
-    twopass->bpm_factor = 1.0;
-    // Initiallize actual and target bits counters for ARF groups so that
-    // at the start we have a neutral bpm adjustment.
-    twopass->rolling_arf_group_target_bits = 1;
-    twopass->rolling_arf_group_actual_bits = 1;
-
     tmp_q = get_twopass_worst_quality(cpi, section_error,
         section_intra_skip + section_inactive_zone, section_target_bandwidth);
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 3ae5b21..14a0b16 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -59,8 +59,8 @@
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
-    int distortion;
-    unsigned int sse;
+    uint32_t distortion;
+    uint32_t sse;
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index f3ffe35..e747277 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -162,6 +162,33 @@
   return &buf[(r >> 3) * stride + (c >> 3)];
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    int64_t tmpmse;                                                    \
+    if (second_pred == NULL) {                                         \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c),      \
+                         sp(r), z, src_stride, &sse);                  \
+    } else {                                                           \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c),     \
+                          sp(r), z, src_stride, &sse, second_pred);    \
+    }                                                                  \
+    tmpmse = thismse;                                                  \
+    tmpmse += MVC(r, c);                                               \
+    if (tmpmse >= INT_MAX) {                                           \
+      v = INT_MAX;                                                     \
+    } else if ((v = (uint32_t)tmpmse) < besterr) {                     \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+#else
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
@@ -182,6 +209,7 @@
     v = INT_MAX;                                                       \
   }
 
+#endif
 #define FIRST_LEVEL_CHECKS                              \
   {                                                     \
     unsigned int left, right, up, down, diag;           \
@@ -310,10 +338,10 @@
                                        const uint8_t *second_pred,
                                        int w, int h, int offset,
                                        int *mvjcost, int *mvcost[2],
-                                       unsigned int *sse1,
-                                       int *distortion) {
-  unsigned int besterr;
+                                       uint32_t *sse1,
+                                       uint32_t *distortion) {
 #if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t besterr;
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
@@ -329,9 +357,13 @@
   } else {
     besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
   }
-  *distortion = besterr;
+  *distortion = (uint32_t)besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  if (besterr >= UINT32_MAX)
+    return UINT32_MAX;
+  return (uint32_t)besterr;
 #else
+  uint32_t besterr;
   (void) xd;
   if (second_pred != NULL) {
     DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
@@ -342,8 +374,8 @@
   }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
   return besterr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static INLINE int divide_and_round(const int n, const int d) {
@@ -373,7 +405,7 @@
                          (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
 }
 
-int vp9_skip_sub_pixel_tree(
+uint32_t vp9_skip_sub_pixel_tree(
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -383,8 +415,8 @@
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion,
-    unsigned int *sse1,
+    uint32_t *distortion,
+    uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -418,7 +450,7 @@
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned_evenmore(
+uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -428,8 +460,8 @@
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion,
-    unsigned int *sse1,
+    uint32_t *distortion,
+    uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -498,7 +530,7 @@
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+uint32_t vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
                                              MV *bestmv, const MV *ref_mv,
                                              int allow_hp,
                                              int error_per_bit,
@@ -507,8 +539,8 @@
                                              int iters_per_step,
                                              int *cost_list,
                                              int *mvjcost, int *mvcost[2],
-                                             int *distortion,
-                                             unsigned int *sse1,
+                                             uint32_t *distortion,
+                                             uint32_t *sse1,
                                              const uint8_t *second_pred,
                                              int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -572,7 +604,7 @@
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+uint32_t vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         MV *bestmv, const MV *ref_mv,
                                         int allow_hp,
                                         int error_per_bit,
@@ -581,8 +613,8 @@
                                         int iters_per_step,
                                         int *cost_list,
                                         int *mvjcost, int *mvcost[2],
-                                        int *distortion,
-                                        unsigned int *sse1,
+                                        uint32_t *distortion,
+                                        uint32_t *sse1,
                                         const uint8_t *second_pred,
                                         int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -674,19 +706,19 @@
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
-                                 MV *bestmv, const MV *ref_mv,
-                                 int allow_hp,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 int forced_stop,
-                                 int iters_per_step,
-                                 int *cost_list,
-                                 int *mvjcost, int *mvcost[2],
-                                 int *distortion,
-                                 unsigned int *sse1,
-                                 const uint8_t *second_pred,
-                                 int w, int h) {
+uint32_t vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
+                                      MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
+                                      int error_per_bit,
+                                      const vp9_variance_fn_ptr_t *vfp,
+                                      int forced_stop,
+                                      int iters_per_step,
+                                      int *cost_list,
+                                      int *mvjcost, int *mvcost[2],
+                                      uint32_t *distortion,
+                                      uint32_t *sse1,
+                                      const uint8_t *second_pred,
+                                      int w, int h) {
   const uint8_t *const z = x->plane[0].src.buf;
   const uint8_t *const src_address = z;
   const int src_stride = x->plane[0].src.stride;
@@ -1381,12 +1413,22 @@
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV mv = {best_mv->row * 8, best_mv->col * 8};
-  unsigned int unused;
-
+  uint32_t unused;
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t err= vfp->vf(what->buf, what->stride,
+                        get_buf_from_mv(in_what, best_mv),
+                        in_what->stride, &unused);
+  err += (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                    x->mvcost, x->errorperbit) : 0);
+  if (err >= INT_MAX)
+    return INT_MAX;
+  return (int)err;
+#else
   return vfp->vf(what->buf, what->stride,
                  get_buf_from_mv(in_what, best_mv), in_what->stride, &unused) +
       (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
                                  x->mvcost, x->errorperbit) : 0);
+#endif
 }
 
 int vp9_get_mvpred_av_var(const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 1b0c860..86cd267 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -74,7 +74,7 @@
                                            BLOCK_SIZE bsize,
                                            int mi_row, int mi_col);
 
-typedef int (fractional_mv_step_fp) (
+typedef uint32_t (fractional_mv_step_fp) (
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -84,7 +84,7 @@
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion, unsigned int *sse1,
+    uint32_t *distortion, uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 554409b..c13f24f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -40,6 +40,14 @@
   int in_use;
 } PRED_BUFFER;
 
+
+static const int pos_shift_16x16[4][4] = {
+  {9, 10, 13, 14},
+  {11, 12, 15, 16},
+  {17, 18, 21, 22},
+  {19, 20, 23, 24}
+};
+
 static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm,
                       const MACROBLOCK *x,
                       const MACROBLOCKD *xd,
@@ -149,7 +157,7 @@
   const int ref = mi->ref_frame[0];
   const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
   MV center_mv;
-  int dis;
+  uint32_t dis;
   int rate_mode;
   const int tmp_col_min = x->mv_col_min;
   const int tmp_col_max = x->mv_col_max;
@@ -582,39 +590,46 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
-                      int *skippable, int64_t *sse, int plane,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   unsigned int var_y, sse_y;
-  (void)plane;
+
   (void)tx_size;
-  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
+                    &sse_y);
   *sse = INT_MAX;
   *skippable = 0;
   return;
 }
 #else
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
-                      int *skippable, int64_t *sse, int plane,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
   const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
   const int step = 1 << (tx_size << 1);
   const int block_step = (1 << tx_size);
   int block = 0, r, c;
-  int shift = tx_size == TX_32X32 ? 0 : 2;
   const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
-      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      xd->mb_to_right_edge >> 5);
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
-      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+      xd->mb_to_bottom_edge >> 5);
   int eob_cost = 0;
+  const int bw = 4 * num_4x4_w;
+  const int bh = 4 * num_4x4_h;
 
   (void)cpi;
-  vp9_subtract_plane(x, bsize, plane);
+
+  // The max tx_size passed in is TX_16X16.
+  assert(tx_size != TX_32X32);
+
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
   *skippable = 1;
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
@@ -626,18 +641,11 @@
         tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
         tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
         uint16_t *const eob = &p->eobs[block];
-        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+        const int diff_stride = bw;
         const int16_t *src_diff;
         src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
         switch (tx_size) {
-          case TX_32X32:
-            vpx_fdct32x32_rd(src_diff, coeff, diff_stride);
-            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                  p->round_fp, p->quant_fp, p->quant_shift,
-                                  qcoeff, dqcoeff, pd->dequant, eob,
-                                  scan_order->scan, scan_order->iscan);
-            break;
           case TX_16X16:
             vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
@@ -670,18 +678,17 @@
     }
   }
 
-  if (*skippable && *sse < INT64_MAX) {
-    *rate = 0;
-    *dist = (*sse << 6) >> shift;
-    *sse = *dist;
-    return;
+  this_rdc->rate = 0;
+  if (*sse < INT64_MAX) {
+    *sse = (*sse << 6) >> 2;
+    if (*skippable) {
+      this_rdc->dist = *sse;
+      return;
+    }
   }
 
   block = 0;
-  *rate = 0;
-  *dist = 0;
-  if (*sse < INT64_MAX)
-    *sse = (*sse << 6) >> shift;
+  this_rdc->dist = 0;
   for (r = 0; r < max_blocks_high; r += block_step) {
     for (c = 0; c < num_4x4_w; c += block_step) {
       if (c < max_blocks_wide) {
@@ -691,25 +698,26 @@
         uint16_t *const eob = &p->eobs[block];
 
         if (*eob == 1)
-          *rate += (int)abs(qcoeff[0]);
+          this_rdc->rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += vpx_satd((const int16_t *)qcoeff, step << 4);
+          this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
 
-        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+        this_rdc->dist +=
+            vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
       }
       block += step;
     }
   }
 
   // If skippable is set, rate gets clobbered later.
-  *rate <<= (2 + VP9_PROB_COST_SHIFT);
-  *rate += (eob_cost << VP9_PROB_COST_SHIFT);
+  this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
+  this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
 }
 #endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
-                               int *out_rate_sum, int64_t *out_dist_sum,
+                               RD_COST *this_rdc,
                                unsigned int *var_y, unsigned int *sse_y,
                                int start_plane, int stop_plane) {
   // Note our transform coeffs are 8 times an orthogonal transform.
@@ -720,8 +728,8 @@
   int64_t dist;
   int i;
 
-  *out_rate_sum = 0;
-  *out_dist_sum = 0;
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
 
   for (i = start_plane; i <= stop_plane; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
@@ -752,8 +760,8 @@
                                  dc_quant >> 3, &rate, &dist);
   #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    *out_rate_sum += rate >> 1;
-    *out_dist_sum += dist << 3;
+    this_rdc->rate += rate >> 1;
+    this_rdc->dist += dist << 3;
 
   #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -768,8 +776,8 @@
                                  ac_quant >> 3, &rate, &dist);
   #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    *out_rate_sum += rate;
-    *out_dist_sum += dist << 4;
+    this_rdc->rate += rate;
+    this_rdc->dist += dist << 4;
   }
 }
 
@@ -906,8 +914,7 @@
   MACROBLOCK *x;
   PREDICTION_MODE mode;
   int skippable;
-  int rate;
-  int64_t dist;
+  RD_COST *rdc;
 };
 
 static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -924,8 +931,7 @@
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   int i, j;
-  int rate;
-  int64_t dist;
+  RD_COST this_rdc;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 
@@ -942,19 +948,19 @@
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
     // TODO(jingning): This needs further refactoring.
-    block_yrd(cpi, x, &rate, &dist, &args->skippable, &this_sse, 0,
-              bsize_tx, VPXMIN(tx_size, TX_16X16));
+    block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
+              VPXMIN(tx_size, TX_16X16));
   } else {
     unsigned int var = 0;
     unsigned int sse = 0;
-    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
-                       plane, plane);
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
+                       plane);
   }
 
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
-  args->rate += rate;
-  args->dist += dist;
+  args->rdc->rate += this_rdc.rate;
+  args->rdc->dist += this_rdc.dist;
 }
 
 static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
@@ -1007,7 +1013,7 @@
   MODE_INFO *const mi = xd->mi[0];
   RD_COST this_rdc, best_rdc;
   PREDICTION_MODE this_mode;
-  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 };
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
   const TX_SIZE intra_tx_size =
       VPXMIN(max_txsize_lookup[bsize],
              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
@@ -1031,22 +1037,20 @@
   // Change the limit of this loop to add other intra prediction
   // mode tests.
   for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+    this_rdc.dist = this_rdc.rate = 0;
     args.mode = this_mode;
     args.skippable = 1;
-    args.rate = 0;
-    args.dist = 0;
+    args.rdc = &this_rdc;
     mi->tx_size = intra_tx_size;
     vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                            estimate_block_intra, &args);
     if (args.skippable) {
       x->skip_txfm[0] = SKIP_TXFM_AC_DC;
-      args.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+      this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
     } else {
       x->skip_txfm[0] = SKIP_TXFM_NONE;
-      args.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+      this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
     }
-    this_rdc.rate = args.rate;
-    this_rdc.dist = args.dist;
     this_rdc.rate += bmode_costs[this_mode];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                              this_rdc.rate, this_rdc.dist);
@@ -1270,9 +1274,11 @@
 }
 #endif  // CONFIG_VP9_TEMPORAL_DENOISING
 
-static INLINE int set_force_skip_low_temp_var(uint8_t *variance_low,
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low,
                                               int mi_row, int mi_col,
                                               BLOCK_SIZE bsize) {
+  const int i = (mi_row & 0x7) >> 1;
+  const int j = (mi_col & 0x7) >> 1;
   int force_skip_low_temp_var = 0;
   // Set force_skip_low_temp_var based on the block size and block offset.
   if (bsize == BLOCK_64X64) {
@@ -1299,6 +1305,19 @@
     } else if ((mi_col & 0x7) && (mi_row & 0x7)) {
       force_skip_low_temp_var = variance_low[8];
     }
+  } else if (bsize == BLOCK_16X16) {
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+  } else if (bsize == BLOCK_32X16) {
+    // The col shift index for the second 16x16 block.
+    const int j2 = ((mi_col + 2) & 0x7) >> 1;
+    // Set only if both 16x16 blocks inside have low temporal variance.
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+                              variance_low[pos_shift_16x16[i][j2]];
+  } else if (bsize == BLOCK_16X32) {
+    // The row shift index for the second 16x16 block.
+    const int i2 = ((mi_row + 2) & 0x7) >> 1;
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+                              variance_low[pos_shift_16x16[i2][j]];
   }
   return force_skip_low_temp_var;
 }
@@ -1450,7 +1469,7 @@
 
   if (cpi->sf.short_circuit_low_temp_var) {
     force_skip_low_temp_var =
-        set_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+        get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
   }
 
   if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
@@ -1503,6 +1522,12 @@
       continue;
     }
 
+    if (cpi->sf.short_circuit_low_temp_var == 2 &&
+        force_skip_low_temp_var && ref_frame == LAST_FRAME &&
+        this_mode == NEWMV) {
+      continue;
+    }
+
     if (cpi->use_svc) {
       if (svc_force_zero_mode[ref_frame - 1] &&
           frame_mv[this_mode][ref_frame].as_int != 0)
@@ -1539,7 +1564,8 @@
           !cpi->use_svc &&
           cpi->oxcf.rc_mode == VPX_CBR) {
         int tmp_sad;
-        int dis, cost_list[5];
+        uint32_t dis;
+        int cost_list[5];
 
         if (bsize < BLOCK_16X16)
           continue;
@@ -1640,8 +1666,9 @@
 
     if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
         && (ref_frame == LAST_FRAME ||
-            (ref_frame == GOLDEN_FRAME && cpi->use_svc))
-        && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
+           (ref_frame == GOLDEN_FRAME &&
+           (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
+           (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
       int pf_rate[3];
       int64_t pf_dist[3];
       unsigned int pf_var[3];
@@ -1696,7 +1723,7 @@
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
 
       // For large partition blocks, extra testing is done.
-      if (bsize > BLOCK_32X32 &&
+      if (cpi->oxcf.rc_mode == VPX_CBR && bsize > BLOCK_32X32 &&
         !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
         cm->base_qindex) {
         model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
@@ -1710,8 +1737,8 @@
 
     if (!this_early_term) {
       this_sse = (int64_t)sse_y;
-      block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
-                &this_sse, 0, bsize, VPXMIN(mi->tx_size, TX_16X16));
+      block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
+                VPXMIN(mi->tx_size, TX_16X16));
       x->skip_txfm[0] = is_skippable;
       if (is_skippable) {
         this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -1737,17 +1764,15 @@
     }
 
     if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
-      int uv_rate = 0;
-      int64_t uv_dist = 0;
+      RD_COST rdc_uv;
       const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
       if (x->color_sensitivity[0])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
       if (x->color_sensitivity[1])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
-      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &uv_rate, &uv_dist,
-                         &var_y, &sse_y, 1, 2);
-      this_rdc.rate += uv_rate;
-      this_rdc.dist += uv_dist;
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
+      this_rdc.rate += rdc_uv.rate;
+      this_rdc.dist += rdc_uv.dist;
     }
 
     this_rdc.rate += rate_mv;
@@ -1842,12 +1867,13 @@
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
   }
   // Perform intra prediction search, if the best SAD is above a certain
-  // threshold. Skip intra prediction if force_skip_low_temp_var is set.
-  if (!force_skip_low_temp_var && perform_intra_pred &&
+  // threshold.
+  if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) &&
+      perform_intra_pred &&
       (best_rdc.rdcost == INT64_MAX ||
        (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
         bsize <= cpi->sf.max_intra_bsize))) {
-    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 };
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
     TX_SIZE intra_tx_size =
@@ -1896,10 +1922,10 @@
 
       mi->mode = this_mode;
       mi->ref_frame[0] = INTRA_FRAME;
+      this_rdc.dist = this_rdc.rate = 0;
       args.mode = this_mode;
       args.skippable = 1;
-      args.rate = 0;
-      args.dist = 0;
+      args.rdc = &this_rdc;
       mi->tx_size = intra_tx_size;
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
@@ -1907,10 +1933,10 @@
       // mirrors the behavior used by inter
       if (args.skippable) {
         x->skip_txfm[0] = SKIP_TXFM_AC_DC;
-        args.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
       } else {
         x->skip_txfm[0] = SKIP_TXFM_NONE;
-        args.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+        this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
       }
       // Inter and intra RD will mismatch in scale for non-screen content.
       if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
@@ -1921,8 +1947,6 @@
           vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
                                                  estimate_block_intra, &args);
       }
-      this_rdc.rate = args.rate;
-      this_rdc.dist = args.dist;
       this_rdc.rate += cpi->mbmode_cost[this_mode];
       this_rdc.rate += ref_frame_cost[INTRA_FRAME];
       this_rdc.rate += intra_cost_penalty;
@@ -2152,7 +2176,7 @@
             const int tmp_col_max = x->mv_col_max;
             const int tmp_row_min = x->mv_row_min;
             const int tmp_row_max = x->mv_row_max;
-            int dummy_dist;
+            uint32_t dummy_dist;
 
             if (i == 0) {
               mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 9766c05..d68b684 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -94,7 +94,7 @@
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int64_t tmp = abs_coeff + round_ptr[rc != 0];
-      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
       if (abs_qcoeff)
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index a304182..0ed0850 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1214,6 +1214,11 @@
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) && need_top_left[mode])
+      continue;
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 
     xd->mi[0]->uv_mode = mode;
 
@@ -1710,8 +1715,8 @@
     x->mv_row_max = tmp_row_max;
 
     if (bestsme < INT_MAX) {
-      int dis; /* TODO: use dis in distortion calculation later. */
-      unsigned int sse;
+      uint32_t dis; /* TODO: use dis in distortion calculation later. */
+      uint32_t sse;
       bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv,
           &ref_mv[id].as_mv,
@@ -1911,7 +1916,7 @@
               INT_MAX, 1);
 
           if (bestsme < INT_MAX) {
-            int distortion;
+            uint32_t distortion;
             cpi->find_fractional_mv_step(
                 x,
                 new_mv,
@@ -2341,7 +2346,7 @@
   x->mv_row_max = tmp_row_max;
 
   if (bestsme < INT_MAX) {
-    int dis;  /* TODO: use dis in distortion calculation later. */
+    uint32_t dis;  /* TODO: use dis in distortion calculation later. */
     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 0090b4f..e7f04a2 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -429,7 +429,7 @@
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
-    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.pass == 0 &&
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
         content != VP9E_CONTENT_SCREEN) {
       // Enable short circuit for low temporal variance.
       sf->short_circuit_low_temp_var = 1;
@@ -450,6 +450,17 @@
     sf->adaptive_rd_thresh = 4;
     sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+    // For speed 8, keep only INTRA_DC mode on non-key frames.
+    if (!is_keyframe) {
+      int i = 0;
+      for (i = 0; i < BLOCK_SIZES; ++i)
+        sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+    }
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+        content != VP9E_CONTENT_SCREEN) {
+      // More aggressive short circuit for speed 8.
+      sf->short_circuit_low_temp_var = 2;
+    }
   }
 }
 
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 71ff0ac..e88a7df 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -449,6 +449,10 @@
 
   // Skip a number of expensive mode evaluations for blocks with very low
   // temporal variance.
+  // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32.
+  // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
+  // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
+  // 32x16.
   int short_circuit_low_temp_var;
 } SPEED_FEATURES;
 
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 2ba2750..02bcf5a 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -264,8 +264,8 @@
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
-  int distortion;
-  unsigned int sse;
+  uint32_t distortion;
+  uint32_t sse;
   int cost_list[5];
 
   MV best_ref_mv1 = {0, 0};
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 1a11a6d..9ad86cb 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -205,7 +205,7 @@
         level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 &&
         level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 &&
         level != LEVEL_6_1 && level != LEVEL_6_2 &&
-        level != LEVEL_UNKNOWN && level != LEVEL_NOT_CARE)
+        level != LEVEL_UNKNOWN && level != LEVEL_MAX)
     ERROR("target_level is invalid");
   }
 
@@ -807,6 +807,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = (int)vp9_get_level(&ctx->cpi->level_info.level_spec);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
                                     vpx_codec_priv_enc_mr_cfg_t *data) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -898,6 +905,11 @@
       break;
   }
 
+  if (deadline == VPX_DL_REALTIME) {
+    ctx->oxcf.pass = 0;
+    new_mode = REALTIME;
+  }
+
   if (ctx->oxcf.mode != new_mode) {
     ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
@@ -964,9 +976,6 @@
   return index_sz;
 }
 
-// vp9 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000LL
-
 static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
                                        int64_t n) {
   return n * TICKS_PER_SEC * timebase->num / timebase->den;
@@ -974,7 +983,7 @@
 
 static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
                                        int64_t n) {
-  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1;
   return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
 
@@ -1547,6 +1556,7 @@
   {VP9_GET_REFERENCE,                 ctrl_get_reference},
   {VP9E_GET_SVC_LAYER_ID,             ctrl_get_svc_layer_id},
   {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+  {VP9E_GET_LEVEL,                    ctrl_get_level},
 
   { -1, NULL},
 };
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index b059d47..2752a86 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -562,7 +562,13 @@
    *
    * Supported in codecs: VP9
    */
-  VP9E_SET_TARGET_LEVEL
+  VP9E_SET_TARGET_LEVEL,
+
+  /*!\brief Codec control function to get bitstream level.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LEVEL
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -821,6 +827,9 @@
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL,  unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
 
+VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LEVEL
+
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index cc9e804..21e3e3d 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -160,3 +160,42 @@
   vst1q_s16(coeff + 48, a6);
   vst1q_s16(coeff + 56, a7);
 }
+
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int i; /* Loop counter only; addressing is done by advancing coeff. */
+
+  /* Rearrange 16x16 to 8x32 and remove stride: run the 8x8 transform on each
+   * quadrant, packing 64 outputs per quadrant into coeff. Top left first. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right: written to coeff[64..127]. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left: written to coeff[128..191]. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right: written to coeff[192..255]. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  for (i = 0; i < 64; i += 8) { /* 8 passes; each combines 8 lanes from all four quadrants. */
+    const int16x8_t a0 = vld1q_s16(coeff + 0);   /* 8 lanes of the top-left output */
+    const int16x8_t a1 = vld1q_s16(coeff + 64);  /* same lanes, top-right output */
+    const int16x8_t a2 = vld1q_s16(coeff + 128); /* same lanes, bottom-left output */
+    const int16x8_t a3 = vld1q_s16(coeff + 192); /* same lanes, bottom-right output */
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1); /* halving add: (a0 + a1) >> 1 */
+    const int16x8_t b1 = vhsubq_s16(a0, a1); /* halving subtract: (a0 - a1) >> 1 */
+    const int16x8_t b2 = vhaddq_s16(a2, a3); /* NOTE(review): halving presumably keeps sums in int16 range - confirm */
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2); /* second butterfly stage uses plain (non-halving) add/sub */
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    vst1q_s16(coeff + 0, c0); /* results overwrite the same lanes they were loaded from */
+    vst1q_s16(coeff + 64, c1);
+    vst1q_s16(coeff + 128, c2);
+    vst1q_s16(coeff + 192, c3);
+
+    coeff += 8; /* step to the next 8 lanes of every quadrant */
+  }
+}
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index c407dd8..9cfe1be 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -118,13 +118,13 @@
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                              int bd) {
   trans = HIGHBD_WRAPLOW(trans, bd);
-  return clip_pixel_highbd(dest + trans, bd);
+  return clip_pixel_highbd(dest + (int)trans, bd);
 }
 #endif
 
 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
   trans = WRAPLOW(trans);
-  return clip_pixel(dest + trans);
+  return clip_pixel(dest + (int)trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 6426ccc..80fcd66 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -53,7 +53,7 @@
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    const int abs_qcoeff = (int)((tmp * quant) >> 16);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
     if (abs_qcoeff)
@@ -109,7 +109,7 @@
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    const int abs_qcoeff = (int)((tmp * quant) >> 15);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
     if (abs_qcoeff)
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index e8bddb0..d960c54 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -275,7 +275,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int  a_stride,
                               const uint8_t *b8, int  b_stride,
-                              int w, int h, uint64_t *sse, uint64_t *sum) {
+                              int w, int h, uint64_t *sse, int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -298,7 +298,7 @@
                               const uint8_t *b8, int  b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)sse_long;
   *sum = (int)sum_long;
@@ -308,7 +308,7 @@
                                const uint8_t *b8, int  b_stride,
                                int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
@@ -318,7 +318,7 @@
                                const uint8_t *b8, int  b_stride,
                                int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
@@ -341,8 +341,10 @@
                                              int b_stride, \
                                              uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 } \
 \
 uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
@@ -351,8 +353,10 @@
                                              int b_stride, \
                                              uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 }
 
 #define HIGHBD_GET_VAR(S) \
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 4144281..a62acb7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1020,7 +1020,7 @@
   specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64_x86inc";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_16x16 sse2/;
+  specialize qw/vpx_hadamard_16x16 sse2 neon/;
 
   add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
   specialize qw/vpx_satd sse2 neon/;
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 9c0d3eb..df5068c 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -158,8 +158,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 4);
 
   dc_value = _mm_set1_epi16(a);
@@ -527,8 +527,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 5);
 
   dc_value = _mm_set1_epi16(a);
@@ -1305,8 +1305,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a, i;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 6);
 
   dc_value = _mm_set1_epi16(a);
@@ -3462,8 +3462,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a, j;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 6);
 
   dc_value = _mm_set1_epi16(a);