diff --git a/README b/README
index c70a11e..460ad73 100644
--- a/README
+++ b/README
@@ -47,7 +47,6 @@
   --help output of the configure script. As of this writing, the list of
   available targets is:
 
-    armv6-darwin-gcc
     armv6-linux-rvct
     armv6-linux-gcc
     armv6-none-rvct
diff --git a/build/make/Makefile b/build/make/Makefile
index 6f54901..dfb7e4b 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -119,29 +119,25 @@
 test-no-data-check::
 exampletest-no-data-check utiltest-no-data-check:
 
-# Add compiler flags for intrinsic files
+# Force to realign stack always on OS/2
 ifeq ($(TOOLCHAIN), x86-os2-gcc)
-STACKREALIGN=-mstackrealign
-else
-STACKREALIGN=
+CFLAGS += -mstackrealign
 endif
 
 $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
 $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%vp9_reconintra.c.d: CFLAGS += $(STACKREALIGN)
-$(BUILD_PFX)%vp9_reconintra.c.o: CFLAGS += $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index b305d33..ee887ab 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -185,6 +185,25 @@
 #
 # Boolean Manipulation Functions
 #
+
+enable_codec(){
+  enabled $1 || echo "  enabling $1"
+  set_all yes $1
+
+  is_in $1 vp8 vp9 vp10 && \
+    set_all yes $1_encoder && \
+    set_all yes $1_decoder
+}
+
+disable_codec(){
+  disabled $1 || echo "  disabling $1"
+  set_all no $1
+
+  is_in $1 vp8 vp9 vp10 && \
+    set_all no $1_encoder && \
+    set_all no $1_decoder
+}
+
 enable_feature(){
   set_all yes $*
 }
@@ -521,22 +540,20 @@
         ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
           [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
         elif [ $action = "disable" ] && ! disabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
           log_echo "  disabling $option"
         elif [ $action = "enable" ] && ! enabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
           log_echo "  enabling $option"
         fi
         ${action}_feature $option
         ;;
       --require-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
             RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
         else
             die_unknown $opt
@@ -638,6 +655,26 @@
   xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
 }
 
+# Print the Xcode version.
+show_xcode_version() {
+  xcodebuild -version | head -n1 | cut -d' ' -f2
+}
+
+# Fails when Xcode version is less than 6.3.
+check_xcode_minimum_version() {
+  xcode_major=$(show_xcode_version | cut -f1 -d.)
+  xcode_minor=$(show_xcode_version | cut -f2 -d.)
+  xcode_min_major=6
+  xcode_min_minor=3
+  if [ ${xcode_major} -lt ${xcode_min_major} ]; then
+    return 1
+  fi
+  if [ ${xcode_major} -eq ${xcode_min_major} ] \
+    && [ ${xcode_minor} -lt ${xcode_min_minor} ]; then
+    return 1
+  fi
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -751,7 +788,14 @@
   enabled shared && soft_enable pic
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
-  IOS_VERSION_MIN="6.0"
+  # Shared library framework builds are only possible on iOS 8 and later.
+  if enabled shared; then
+    IOS_VERSION_OPTIONS="--enable-shared"
+    IOS_VERSION_MIN="8.0"
+  else
+    IOS_VERSION_OPTIONS=""
+    IOS_VERSION_MIN="6.0"
+  fi
 
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
@@ -1018,18 +1062,7 @@
           NM="$(${XCRUN_FIND} nm)"
           RANLIB="$(${XCRUN_FIND} ranlib)"
           AS_SFX=.s
-
-          # Special handling of ld for armv6 because libclang_rt.ios.a does
-          # not contain armv6 support in Apple's clang package:
-          #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
-          # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
-          # renders support for armv6 unnecessary because the 3GS and up
-          # support neon.
-          if [ "${tgt_isa}" = "armv6" ]; then
-            LD="$(${XCRUN_FIND} ld)"
-          else
-            LD="${CXX:-$(${XCRUN_FIND} ld)}"
-          fi
+          LD="${CXX:-$(${XCRUN_FIND} ld)}"
 
           # ASFLAGS is written here instead of using check_add_asflags
           # because we need to overwrite all of ASFLAGS and purge the
@@ -1055,6 +1088,19 @@
             [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
           done
 
+          case ${tgt_isa} in
+            armv7|armv7s|armv8|arm64)
+              if enabled neon && ! check_xcode_minimum_version; then
+                soft_disable neon
+                log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
+                if enabled neon_asm; then
+                  soft_disable neon_asm
+                  log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
+                fi
+              fi
+              ;;
+          esac
+
           asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
 
           if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
@@ -1069,7 +1115,7 @@
           if enabled rvct; then
             # Check if we have CodeSourcery GCC in PATH. Needed for
             # libraries
-            hash arm-none-linux-gnueabi-gcc 2>&- || \
+            which arm-none-linux-gnueabi-gcc 2>&- || \
               die "Couldn't find CodeSourcery GCC from PATH"
 
             # Use armcc as a linker to enable translation of
@@ -1110,7 +1156,7 @@
             check_add_ldflags -mfp64
             ;;
           i6400)
-            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight 
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
             check_add_cflags  -mload-store-pairs -mhard-float -mfp64
             check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
             check_add_ldflags -mips64r6 -mabi=64 -mfp64
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 182ea28..e98611d 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -211,7 +211,7 @@
 done
 
 # Make one call to fix_path for file_list to improve performance.
-fix_file_list
+fix_file_list file_list
 
 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
diff --git a/build/make/ios-Info.plist b/build/make/ios-Info.plist
new file mode 100644
index 0000000..d157b11
--- /dev/null
+++ b/build/make/ios-Info.plist
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>VPX</string>
+	<key>CFBundleIdentifier</key>
+	<string>org.webmproject.VPX</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>VPX</string>
+	<key>CFBundlePackageType</key>
+	<string>FMWK</string>
+	<key>CFBundleShortVersionString</key>
+	<string>${VERSION}</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleSupportedPlatforms</key>
+	<array>
+		<string>iPhoneOS</string>
+	</array>
+	<key>CFBundleVersion</key>
+	<string>${VERSION}</string>
+	<key>MinimumOSVersion</key>
+	<string>${IOS_VERSION_MIN}</string>
+	<key>UIDeviceFamily</key>
+	<array>
+		<integer>1</integer>
+		<integer>2</integer>
+	</array>
+	<key>VPXFullVersion</key>
+	<string>${FULLVERSION}</string>
+</dict>
+</plist>
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index ae5ba18..c703f22 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -24,6 +24,7 @@
                 --disable-unit-tests"
 DIST_DIR="_dist"
 FRAMEWORK_DIR="VPX.framework"
+FRAMEWORK_LIB="VPX.framework/VPX"
 HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
 SCRIPT_DIR=$(dirname "$0")
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
@@ -137,6 +138,44 @@
   printf "#endif  // ${include_guard}" >> "${config_file}"
 }
 
+# Verifies that $FRAMEWORK_LIB fat library contains requested builds.
+verify_framework_targets() {
+  local requested_cpus=""
+  local cpu=""
+
+  # Extract CPU from full target name.
+  for target; do
+    cpu="${target%%-*}"
+    if [ "${cpu}" = "x86" ]; then
+      # lipo -info outputs i386 for libvpx x86 targets.
+      cpu="i386"
+    fi
+    requested_cpus="${requested_cpus}${cpu} "
+  done
+
+  # Get target CPUs present in framework library.
+  local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB})
+
+  # $LIPO -info outputs a string like the following:
+  #   Architectures in the fat file: $FRAMEWORK_LIB <architectures>
+  # Capture only the architecture strings.
+  targets_built=${targets_built##*: }
+
+  # Sort CPU strings to make the next step a simple string compare.
+  local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ")
+  local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ")
+
+  vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}"
+  vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}"
+
+  if [ "${requested}" != "${actual}" ]; then
+    elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list."
+    elog "  Requested target CPUs: ${requested}"
+    elog "  Actual target CPUs: ${actual}"
+    return 1
+  fi
+}
+
 # Configures and builds each target specified by $1, and then builds
 # VPX.framework.
 build_framework() {
@@ -157,7 +196,12 @@
   for target in ${targets}; do
     build_target "${target}"
     target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
-    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.a"
+    if [ "${ENABLE_SHARED}" = "yes" ]; then
+      local suffix="dylib"
+    else
+      local suffix="a"
+    fi
+    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.${suffix}"
   done
 
   cd "${ORIG_PWD}"
@@ -176,13 +220,25 @@
   # Copy in vpx_version.h.
   cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
 
-  vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
+  if [ "${ENABLE_SHARED}" = "yes" ]; then
+    # Adjust the dylib's name so dynamic linking in apps works as expected.
+    install_name_tool -id '@rpath/VPX.framework/VPX' ${FRAMEWORK_DIR}/VPX
+
+    # Copy in Info.plist.
+    cat "${SCRIPT_DIR}/ios-Info.plist" \
+      | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
+      | sed "s/\${VERSION}/${VERSION}/g" \
+      | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
+      > "${FRAMEWORK_DIR}/Info.plist"
+  fi
+
+  # Confirm VPX.framework/VPX contains the targets requested.
+  verify_framework_targets ${targets}
+
+  vlog "Created fat library ${FRAMEWORK_LIB} containing:"
   for lib in ${lib_list}; do
     vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
   done
-
-  # TODO(tomfinegan): Verify that expected targets are included within
-  # VPX.framework/VPX via lipo -info.
 }
 
 # Trap function. Cleans up the subtree used to build all targets contained in
@@ -213,6 +269,7 @@
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
+    --enable-shared: Build a dynamic framework for use on iOS 8 or later.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
     --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
               and x86_64. Allows linking to framework when builds target MacOSX
@@ -251,6 +308,9 @@
       iosbuild_usage
       exit
       ;;
+    --enable-shared)
+      ENABLE_SHARED=yes
+      ;;
     --preserve-build-output)
       PRESERVE_BUILD_OUTPUT=yes
       ;;
@@ -278,6 +338,21 @@
   shift
 done
 
+if [ "${ENABLE_SHARED}" = "yes" ]; then
+  CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
+fi
+
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
+if [ "$ENABLE_SHARED" = "yes" ]; then
+  IOS_VERSION_OPTIONS="--enable-shared"
+  IOS_VERSION_MIN="8.0"
+else
+  IOS_VERSION_OPTIONS=""
+  IOS_VERSION_MIN="6.0"
+fi
+
 if [ "${VERBOSE}" = "yes" ]; then
 cat << EOF
   BUILD_ROOT=${BUILD_ROOT}
@@ -285,6 +360,7 @@
   CONFIGURE_ARGS=${CONFIGURE_ARGS}
   EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
   FRAMEWORK_DIR=${FRAMEWORK_DIR}
+  FRAMEWORK_LIB=${FRAMEWORK_LIB}
   HEADER_DIR=${HEADER_DIR}
   LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
   LIPO=${LIPO}
@@ -292,8 +368,13 @@
   ORIG_PWD=${ORIG_PWD}
   PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
   TARGETS="$(print_list "" ${TARGETS})"
+  ENABLE_SHARED=${ENABLE_SHARED}
   OSX_TARGETS="${OSX_TARGETS}"
   SIM_TARGETS="${SIM_TARGETS}"
+  SCRIPT_DIR="${SCRIPT_DIR}"
+  FULLVERSION="${FULLVERSION}"
+  VERSION="${VERSION}"
+  IOS_VERSION_MIN="${IOS_VERSION_MIN}"
 EOF
 fi
 
diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh
index 90c1488..88f1cf9 100644
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@@ -39,11 +39,12 @@
 }
 
 # Corrects the paths in file_list in one pass for efficiency.
+# $1 is the name of the array to be modified.
 fix_file_list() {
-    # TODO(jzern): this could be more generic and take the array as a param.
-    files=$(fix_path "${file_list[@]}")
+    declare -n array_ref=$1
+    files=$(fix_path "${array_ref[@]}")
     local IFS=$'\n'
-    file_list=($files)
+    array_ref=($files)
 }
 
 generate_uuid() {
diff --git a/build/make/version.sh b/build/make/version.sh
index b340142..6967527 100755
--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -24,8 +24,9 @@
 id=${3:-VERSION_STRING}
 
 git_version_id=""
-if [ -d "${source_path}/.git" ]; then
+if [ -e "${source_path}/.git" ]; then
     # Source Path is a git working copy. Check for local modifications.
+    # Note that git submodules may have a file as .git, not a directory.
     export GIT_DIR="${source_path}/.git"
     git_version_id=`git describe --match=v[0-9]* 2>/dev/null`
 fi
diff --git a/configure b/configure
index 45ec93f..ae9bb5d 100755
--- a/configure
+++ b/configure
@@ -98,7 +98,6 @@
 
 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
@@ -191,12 +190,12 @@
 fi
 
 # disable codecs when their source directory does not exist
-[ -d "${source_path}/vp8" ] || disable_feature vp8
-[ -d "${source_path}/vp9" ] || disable_feature vp9
-[ -d "${source_path}/vp10" ] || disable_feature vp10
+[ -d "${source_path}/vp8" ] || disable_codec vp8
+[ -d "${source_path}/vp9" ] || disable_codec vp9
+[ -d "${source_path}/vp10" ] || disable_codec vp10
 
 # disable vp10 codec by default
-disable_feature vp10
+disable_codec vp10
 
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
@@ -406,15 +405,19 @@
     for opt do
         optval="${opt#*=}"
         case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --disable-codecs)
+          for c in ${CODEC_FAMILIES}; do disable_codec $c; done
+          ;;
         --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${EXPERIMENT_LIST}; then
             if enabled experimental; then
                 ${action}_feature $option
             else
                 log_echo "Ignoring $opt -- not in experimental mode."
             fi
+        elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then
+            ${action}_codec ${option}
         else
             process_common_cmdline $opt
         fi
@@ -428,14 +431,6 @@
 post_process_cmdline() {
     c=""
 
-    # If the codec family is disabled, disable all components of that family.
-    # If the codec family is enabled, enable all components of that family.
-    log_echo "Configuring selected codecs"
-    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable_feature ${c}
-        enabled ${c%%_*} && enable_feature ${c}
-    done
-
     # Enable all detected codecs, if they haven't been disabled
     for c in ${CODECS}; do soft_enable $c; done
 
@@ -530,13 +525,18 @@
         # Can only build shared libs on a subset of platforms. Doing this check
         # here rather than at option parse time because the target auto-detect
         # magic happens after the command line has been parsed.
-        if ! enabled linux && ! enabled os2; then
+        case "${tgt_os}" in
+        linux|os2|darwin*|iphonesimulator*)
+            # Supported platforms
+            ;;
+        *)
             if enabled gnu; then
                 echo "--enable-shared is only supported on ELF; assuming this is OK"
             else
-                die "--enable-shared only supported on ELF and OS/2 for now"
+                die "--enable-shared only supported on ELF, OS/2, and Darwin for now"
             fi
-        fi
+            ;;
+        esac
     fi
     if [ -z "$CC" ] || enabled external_build; then
         echo "Bypassing toolchain for environment detection."
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index a307729..64f0a01 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -109,8 +109,8 @@
 void usage_exit(void) {
   fprintf(stderr,
           "Usage: %s <codec> <width> <height> <infile> <outfile> "
-              "<keyframe-interval> [<error-resilient>]\nSee comments in "
-              "simple_encoder.c for more information.\n",
+              "<keyframe-interval> <error-resilient> <frames to encode>\n"
+              "See comments in simple_encoder.c for more information.\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -147,6 +147,7 @@
   return got_pkts;
 }
 
+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
 int main(int argc, char **argv) {
   FILE *infile = NULL;
   vpx_codec_ctx_t codec;
@@ -157,12 +158,11 @@
   VpxVideoInfo info = {0};
   VpxVideoWriter *writer = NULL;
   const VpxInterface *encoder = NULL;
-  const int fps = 30;        // TODO(dkovalev) add command line argument
-  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+  const int fps = 30;
+  const int bitrate = 200;
   int keyframe_interval = 0;
-
-  // TODO(dkovalev): Add some simple command line parsing code to make the
-  // command line more flexible.
+  int max_frames = 0;
+  int frames_encoded = 0;
   const char *codec_arg = NULL;
   const char *width_arg = NULL;
   const char *height_arg = NULL;
@@ -172,7 +172,7 @@
 
   exec_name = argv[0];
 
-  if (argc < 7)
+  if (argc != 9)
     die("Invalid number of arguments");
 
   codec_arg = argv[1];
@@ -181,6 +181,7 @@
   infile_arg = argv[4];
   outfile_arg = argv[5];
   keyframe_interval_arg = argv[6];
+  max_frames = strtol(argv[8], NULL, 0);
 
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
@@ -219,7 +220,7 @@
   cfg.g_timebase.num = info.time_base.numerator;
   cfg.g_timebase.den = info.time_base.denominator;
   cfg.rc_target_bitrate = bitrate;
-  cfg.g_error_resilient = argc > 7 ? strtol(argv[7], NULL, 0) : 0;
+  cfg.g_error_resilient = strtol(argv[7], NULL, 0);
 
   writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer)
@@ -237,6 +238,9 @@
     if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
       flags |= VPX_EFLAG_FORCE_KF;
     encode_frame(&codec, &raw, frame_count++, flags, writer);
+    frames_encoded++;
+    if (max_frames > 0 && frames_encoded >= max_frames)
+      break;
   }
 
   // Flush encoder.
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index aecc11d..15a6617 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -59,7 +59,9 @@
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+              "<frame limit>\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -129,7 +131,8 @@
 static vpx_fixed_buf_t pass0(vpx_image_t *raw,
                              FILE *infile,
                              const VpxInterface *encoder,
-                             const vpx_codec_enc_cfg_t *cfg) {
+                             const vpx_codec_enc_cfg_t *cfg,
+                             int max_frames) {
   vpx_codec_ctx_t codec;
   int frame_count = 0;
   vpx_fixed_buf_t stats = {NULL, 0};
@@ -142,6 +145,8 @@
     ++frame_count;
     get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
                     &stats);
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
   }
 
   // Flush encoder.
@@ -159,7 +164,8 @@
                   FILE *infile,
                   const char *outfile_name,
                   const VpxInterface *encoder,
-                  const vpx_codec_enc_cfg_t *cfg) {
+                  const vpx_codec_enc_cfg_t *cfg,
+                  int max_frames) {
   VpxVideoInfo info = {
     encoder->fourcc,
     cfg->g_w,
@@ -181,6 +187,9 @@
   while (vpx_img_read(raw, infile)) {
     ++frame_count;
     encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
   }
 
   // Flush encoder.
@@ -213,11 +222,14 @@
   const char *const height_arg = argv[3];
   const char *const infile_arg = argv[4];
   const char *const outfile_arg = argv[5];
+  int max_frames = 0;
   exec_name = argv[0];
 
-  if (argc != 6)
+  if (argc != 7)
     die("Invalid number of arguments.");
 
+  max_frames = strtol(argv[6], NULL, 0);
+
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
     die("Unsupported codec.");
@@ -249,13 +261,13 @@
 
   // Pass 0
   cfg.g_pass = VPX_RC_FIRST_PASS;
-  stats = pass0(&raw, infile, encoder, &cfg);
+  stats = pass0(&raw, infile, encoder, &cfg, max_frames);
 
   // Pass 1
   rewind(infile);
   cfg.g_pass = VPX_RC_LAST_PASS;
   cfg.rc_twopass_stats_in = stats;
-  pass1(&raw, infile, outfile_arg, encoder, &cfg);
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, max_frames);
   free(stats.buf);
 
   vpx_img_free(&raw);
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 16abb9d..e6c09fb 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -715,7 +715,7 @@
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
     vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
     vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
     vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
diff --git a/ivfdec.c b/ivfdec.c
index 6dcd66f..7fc25a0 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -23,7 +23,7 @@
   // we can guess the framerate using only the timebase in this
   // case. Other files would require reading ahead to guess the
   // timebase, like we do for webm.
-  if (*num < 1000) {
+  if (*den > 0 && *den < 1000000000 && *num > 0 && *num < 1000) {
     // Correct for the factor of 2 applied to the timebase in the encoder.
     if (*num & 1)
       *den *= 2;
diff --git a/libs.mk b/libs.mk
index c2a4725..f563bd3 100644
--- a/libs.mk
+++ b/libs.mk
@@ -183,6 +183,9 @@
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
+ifeq ($(CONFIG_SPATIAL_SVC),yes)
+CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc
+endif
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
 
 INSTALL-LIBS-yes += include/vpx/vpx_codec.h
@@ -270,6 +273,12 @@
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                              libvpx.dylib  )
 else
+ifeq ($(filter iphonesimulator%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF          := .dylib
+EXPORT_FILE             := libvpx.syms
+LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, libvpx.dylib)
+else
 ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx$(SO_VERSION_MAJOR).dll
 SHARED_LIB_SUF          := _dll.a
@@ -285,6 +294,7 @@
                              libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
 endif
 endif
+endif
 
 LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\
                            $(notdir $(LIBVPX_SO_SYMLINKS)) \
diff --git a/md5_utils.c b/md5_utils.c
index f4f893a..a9b979a 100644
--- a/md5_utils.c
+++ b/md5_utils.c
@@ -150,12 +150,23 @@
 #define MD5STEP(f,w,x,y,z,in,s) \
   (w += f(x,y,z) + in, w = (w<<s | w>>(32-s)) + x)
 
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
 /*
  * The core of the MD5 algorithm, this alters an existing MD5 hash to
  * reflect the addition of 16 longwords of new data.  MD5Update blocks
  * the data and converts bytes into longwords for this routine.
  */
-void
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void
 MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) {
   register UWORD32 a, b, c, d;
 
@@ -238,4 +249,6 @@
   buf[3] += d;
 }
 
+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
 #endif
diff --git a/test/acm_random.h b/test/acm_random.h
index ff5c93e..b94b6e1 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -32,6 +32,12 @@
     return (value >> 15) & 0xffff;
   }
 
+  int16_t Rand9Signed(void) {
+    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+    const uint32_t value = random_.Generate(512);
+    return static_cast<int16_t>(value) - 256;
+  }
+
   uint8_t Rand8(void) {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
new file mode 100644
index 0000000..e9945c4
--- /dev/null
+++ b/test/add_noise_test.cc
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <math.h>
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+// TODO(jimbankoski): make width and height integers not unsigned.
+typedef void (*AddNoiseFunc)(unsigned char *start, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], unsigned int width,
+                             unsigned int height, int pitch);
+
+class AddNoiseTest
+    : public ::testing::TestWithParam<AddNoiseFunc> {
+ public:
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+  virtual ~AddNoiseTest() {}
+};
+
+double stddev6(char a, char b, char c, char d, char e, char f) {
+  const double n = (a + b + c + d + e + f) / 6.0;
+  const double v = ((a - n) * (a - n) + (b - n) * (b - n) + (c - n) * (c - n) +
+                    (d - n) * (d - n) + (e - n) * (e - n) + (f - n) * (f - n)) /
+                   6.0;
+  return sqrt(v);
+}
+
+// TODO(jimbankoski): The following 2 functions are duplicated in each codec.
+// For now the vp9 one has been copied into the test as is. We should normalize
+// these in vpx_dsp and not have 3 copies of these unless there is different
+// noise we add for each codec.
+
+double gaussian(double sigma, double mu, double x) {
+  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int setup_noise(int size_noise, char *noise) {
+  char char_dist[300];
+  const int ai = 4;
+  const int qi = 24;
+  const double sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* set up a lookup table of 256 entries that matches
+   * a gaussian distribution with sigma determined by q.
+   */
+  int next = 0;
+
+  for (int i = -32; i < 32; i++) {
+    int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
+
+    if (a_i) {
+      for (int j = 0; j < a_i; j++) {
+        char_dist[next + j] = (char)(i);
+      }
+
+      next = next + a_i;
+    }
+  }
+
+  for (; next < 256; next++)
+    char_dist[next] = 0;
+
+  for (int i = 0; i < size_noise; i++) {
+    noise[i] = char_dist[rand() & 0xff];  // NOLINT
+  }
+
+  // Returns the most negative value in distribution.
+  return char_dist[0];
+}
+
+TEST_P(AddNoiseTest, CheckNoiseAdded) {
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+  const int width  = 64;
+  const int height = 64;
+  const int image_size = width * height;
+  char noise[3072];
+
+  const int clamp = setup_noise(3072, noise);
+  for (int i = 0; i < 16; i++) {
+    blackclamp[i] = -clamp;
+    whiteclamp[i] = -clamp;
+    bothclamp[i] = -2 * clamp;
+  }
+
+  uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+  memset(s, 99, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure we don't end up having either the same or no added
+  // noise either vertically or horizontally.
+  for (int i = 0; i < image_size - 6 * width - 6; ++i) {
+    const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99,
+                              s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99);
+    const double vd = stddev6(s[i] - 99, s[i + width] - 99,
+                              s[i + 2 * width] - 99, s[i + 3 * width] - 99,
+                              s[i + 4 * width] - 99, s[i + 5 * width] - 99);
+
+    EXPECT_NE(hd, 0);
+    EXPECT_NE(vd, 0);
+  }
+
+  // Initialize pixels in the image to 255 and check for roll over.
+  memset(s, 255, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure don't roll over.
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_GT((int)s[i], 10) << "i = " << i;
+  }
+
+  // Initialize pixels in the image to 0 and check for roll under.
+  memset(s, 0, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure don't roll under.
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_LT((int)s[i], 245) << "i = " << i;
+  }
+
+  vpx_free(s);
+}
+
+TEST_P(AddNoiseTest, CheckCvsAssembly) {
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+  const int width  = 64;
+  const int height = 64;
+  const int image_size = width * height;
+  char noise[3072];
+
+  const int clamp = setup_noise(3072, noise);
+  for (int i = 0; i < 16; i++) {
+    blackclamp[i] = -clamp;
+    whiteclamp[i] = -clamp;
+    bothclamp[i] = -2 * clamp;
+  }
+
+  uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+  uint8_t *const d = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+
+  memset(s, 99, image_size);
+  memset(d, 99, image_size);
+
+  srand(0);
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+  srand(0);
+  ASM_REGISTER_STATE_CHECK(vpx_plane_add_noise_c(d, noise, blackclamp,
+                                                 whiteclamp, bothclamp,
+                                                 width, height, width));
+
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_EQ((int)s[i], (int)d[i]) << "i = " << i;
+  }
+
+  vpx_free(d);
+  vpx_free(s);
+}
+
+INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_sse2));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_msa));
+#endif
+}  // namespace
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 120d475..21f185a 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -138,7 +138,8 @@
   // and filter_max_width          = 16
   //
   uint8_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
 
   // Horizontal pass (src -> transposed intermediate).
   uint8_t *output_ptr = intermediate_buffer;
@@ -250,7 +251,8 @@
    * and filter_max_width = 16
    */
   uint16_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
 
   // Horizontal pass (src -> transposed intermediate).
   {
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 5467c46..2f1db9c 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -90,7 +90,7 @@
           << pkt->data.frame.pts;
     }
 
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
 
     // Subtract from the buffer the bits associated with a played back frame.
     bits_in_buffer_model_ -= frame_size_in_bits;
@@ -450,7 +450,28 @@
   int denoiser_offon_period_;
 };
 
-// Check basic rate targeting,
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBR) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  for (int i = 400; i <= 800; i += 400) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+  }
+}
+
+// Check basic rate targeting for CBR,
 TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
@@ -474,7 +495,7 @@
   }
 }
 
-// Check basic rate targeting,
+// Check basic rate targeting for CBR.
 TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
   ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 3c16e6e..e6224b2 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -365,10 +365,10 @@
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int32_t diff =
             bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int32_t diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         if (max_error < error)
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 3cb0a8e..278d72d 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -147,10 +147,10 @@
 
     for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-      const uint32_t diff =
+      const int32_t diff =
           bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-      const uint32_t diff = dst[j] - src[j];
+      const int32_t diff = dst[j] - src[j];
 #endif
       const uint32_t error = diff * diff;
       if (max_error < error)
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 59ce895..f6b6567 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -302,22 +302,12 @@
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    MMX, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0,
-                   VPX_BITS_8, 16)));
-#endif
-
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0,
-                   VPX_BITS_8, 16)));
+        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8, 16)));
 #endif
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index edf4682..29f2158 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -425,10 +425,10 @@
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
@@ -458,7 +458,7 @@
         coeff_r[j] = static_cast<tran_low_t>(round(out_r[j]));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = coeff[j] - coeff_r[j];
+        const int32_t diff = coeff[j] - coeff_r[j];
         const uint32_t error = diff * diff;
         EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 DCT has error " << error
@@ -511,10 +511,10 @@
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
 #else
-        const uint32_t diff = dst[j] - ref[j];
+        const int diff = dst[j] - ref[j];
 #endif
         const uint32_t error = diff * diff;
         EXPECT_EQ(0u, error)
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
new file mode 100644
index 0000000..7a5bd5b
--- /dev/null
+++ b/test/hadamard_test.cc
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
+
+void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
+  int16_t b[8];
+  for (int i = 0; i < 8; i += 2) {
+    b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
+    b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
+  }
+  int16_t c[8];
+  for (int i = 0; i < 8; i += 4) {
+    c[i + 0] = b[i + 0] + b[i + 2];
+    c[i + 1] = b[i + 1] + b[i + 3];
+    c[i + 2] = b[i + 0] - b[i + 2];
+    c[i + 3] = b[i + 1] - b[i + 3];
+  }
+  out[0] = c[0] + c[4];
+  out[7] = c[1] + c[5];
+  out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7];
+  out[2] = c[0] - c[4];
+  out[6] = c[1] - c[5];
+  out[1] = c[2] - c[6];
+  out[5] = c[3] - c[7];
+}
+
+void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
+  int16_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    hadamard_loop(a + i, a_stride, buf + i * 8);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    hadamard_loop(buf + i, 8, b + i * 8);
+  }
+}
+
+void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const int16_t a0 = b[0];
+    const int16_t a1 = b[64];
+    const int16_t a2 = b[128];
+    const int16_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
+    const int16_t b0 = (a0 + a1) >> 1;
+    const int16_t b1 = (a0 - a1) >> 1;
+    const int16_t b2 = (a2 + a3) >> 1;
+    const int16_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[  0] = b0 + b2;
+    b[ 64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  HadamardFunc h_func_;
+  ACMRandom rnd_;
+};
+
+class Hadamard8x8Test : public HadamardTestBase {};
+
+TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[64]);
+  DECLARE_ALIGNED(16, int16_t, b[64]);
+  int16_t b_ref[64];
+  for (int i = 0; i < 64; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard8x8(a, 8, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 64);
+  std::sort(b_ref, b_ref + 64);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard8x8Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[64]);
+  int16_t b_ref[64];
+  for (int i = 0; i < 64 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard8x8(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 64);
+    std::sort(b_ref, b_ref + 64);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_ssse3));
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_neon));
+#endif  // HAVE_NEON
+
+class Hadamard16x16Test : public HadamardTestBase {};
+
+TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard16x16(a, 16, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 16 * 16);
+  std::sort(b_ref, b_ref + 16 * 16);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard16x16Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard16x16(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 16 * 16);
+    std::sort(b_ref, b_ref + 16 * 16);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_neon));
+#endif  // HAVE_NEON
+}  // namespace
diff --git a/test/level_test.cc b/test/level_test.cc
new file mode 100644
index 0000000..62d0247
--- /dev/null
+++ b/test/level_test.cc
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+class LevelTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  LevelTest()
+     : EncoderTest(GET_PARAM(0)),
+       encoding_mode_(GET_PARAM(1)),
+       cpu_used_(GET_PARAM(2)),
+       min_gf_internal_(24),
+       target_level_(0),
+       level_(0) {}
+  virtual ~LevelTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_target_bitrate = 400;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_gf_internal_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+    encoder->Control(VP9E_GET_LEVEL, &level_);
+    ASSERT_LE(level_, 51);
+    ASSERT_GE(level_, 0);
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int min_gf_internal_;
+  int target_level_;
+  int level_;
+};
+
+// Test for keeping level stats only
+TEST_P(LevelTest, TestTargetLevel0) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+  target_level_ = 0;
+  min_gf_internal_ = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(11, level_);
+
+  cfg_.rc_target_bitrate = 1600;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(20, level_);
+}
+
+// Test for level control being turned off
+TEST_P(LevelTest, TestTargetLevel255) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       30);
+  target_level_ = 255;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
+  static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0));
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int level = 0; level <= 256; ++level) {
+    if (level == 10 || level == 11 || level == 20 || level == 21 ||
+        level == 30 || level == 31 || level == 40 || level == 41 ||
+        level == 50 || level == 51 || level == 52 || level == 60 ||
+        level == 61 || level == 62 || level == 0 || level == 255)
+      EXPECT_EQ(VPX_CODEC_OK,
+                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
+    else
+      EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
+  }
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
+}
+
+VP9_INSTANTIATE_TEST_CASE(LevelTest,
+                          ::testing::Values(::libvpx_test::kTwoPassGood,
+                                            ::libvpx_test::kOnePassGood),
+                          ::testing::Range(0, 9));
+}  // namespace
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 778a36c..94646e4 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -430,16 +430,6 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    MMX, Loop8Test6Param,
-    ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_mmx,
-                   &vpx_lpf_horizontal_4_c, 8),
-        make_tuple(&vpx_lpf_vertical_4_mmx,
-                   &vpx_lpf_vertical_4_c, 8)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -497,12 +487,16 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_sse2,
+                   &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_horizontal_8_sse2,
                    &vpx_lpf_horizontal_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
                    &vpx_lpf_horizontal_edge_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
                    &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_sse2,
+                   &vpx_lpf_vertical_4_c, 8),
         make_tuple(&vpx_lpf_vertical_8_sse2,
                    &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_vertical_16_sse2,
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
new file mode 100644
index 0000000..dbe4342
--- /dev/null
+++ b/test/minmax_test.cc
@@ -0,0 +1,132 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride,
+                           const uint8_t *b, int b_stride,
+                           int *min, int *max);
+
+class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
+ public:
+  virtual void SetUp() {
+    mm_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  MinMaxFunc mm_func_;
+  ACMRandom rnd_;
+};
+
+void reference_minmax(const uint8_t *a, int a_stride,
+                      const uint8_t *b, int b_stride,
+                      int *min_ret, int *max_ret) {
+  int min = 255;
+  int max = 0;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(MinMaxTest, MinValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 255, sizeof(b));
+    b[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(255, max);
+    EXPECT_EQ(i, min);
+  }
+}
+
+TEST_P(MinMaxTest, MaxValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+    b[i] = i;  // Set a maximum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+}
+
+TEST_P(MinMaxTest, CompareReference) {
+  uint8_t a[64], b[64];
+  for (int j = 0; j < 64; j++) {
+    a[j] = rnd_.Rand8();
+    b[j] = rnd_.Rand8();
+  }
+
+  int min_ref, max_ref, min, max;
+  reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t a[8 * 64], b[8 * 64];
+  for (int i = 0; i < 8 * 64; i++) {
+    a[i] = rnd_.Rand8();
+    b[i] = rnd_.Rand8();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_neon));
+#endif
+
+}  // namespace
diff --git a/test/realtime_test.cc b/test/realtime_test.cc
new file mode 100644
index 0000000..24749e4
--- /dev/null
+++ b/test/realtime_test.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 2;
+
+class RealtimeTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  RealtimeTest()
+      : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
+  virtual ~RealtimeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    cfg_.g_lag_in_frames = 0;
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    // TODO(tomfinegan): We're changing the pass value here to make sure
+    // we get frames when real time mode is combined with |g_pass| set to
+    // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
+    // the pass value based on the mode passed into EncoderTest::SetMode(),
+    // which overrides the one specified in SetUp() above.
+    cfg_.g_pass = VPX_RC_FIRST_PASS;
+  }
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+    frame_packets_++;
+  }
+
+  int frame_packets_;
+};
+
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+  video.set_limit(kFramesToEncode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_EQ(kFramesToEncode, frame_packets_);
+}
+
+VP8_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+
+}  // namespace
diff --git a/test/register_state_check.h b/test/register_state_check.h
index 489c419..5336f2f 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -36,16 +36,10 @@
 #include <windows.h>
 #include <winnt.h>
 
-namespace testing {
-namespace internal {
-
 inline bool operator==(const M128A& lhs, const M128A& rhs) {
   return (lhs.Low == rhs.Low && lhs.High == rhs.High);
 }
 
-}  // namespace internal
-}  // namespace testing
-
 namespace libvpx_test {
 
 // Compares the state of xmm[6-15] at construction with their state at
diff --git a/test/resize_test.cc b/test/resize_test.cc
index eaebd75..90f5452 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdio.h>
+
 #include <climits>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -558,9 +560,13 @@
     }
   }
 
+#if CONFIG_VP9_DECODER
   // Verify that we get 1 resize down event in this test.
   ASSERT_EQ(1, resize_count) << "Resizing should occur.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+#endif
 }
 
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
@@ -602,9 +608,13 @@
     }
   }
 
+#if CONFIG_VP9_DECODER
   // Verify that we get 2 resize events in this test.
   ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+#endif
 }
 
 vpx_img_fmt_t CspForFrameNumber(int frame) {
diff --git a/test/sad_test.cc b/test/sad_test.cc
index ab723e7..f277294 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -749,17 +749,6 @@
 
 //------------------------------------------------------------------------------
 // x86 functions
-#if HAVE_MMX
-const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, &vpx_sad16x16_mmx, -1),
-  make_tuple(16, 8, &vpx_sad16x8_mmx, -1),
-  make_tuple(8, 16, &vpx_sad8x16_mmx, -1),
-  make_tuple(8, 8, &vpx_sad8x8_mmx, -1),
-  make_tuple(4, 4, &vpx_sad4x4_mmx, -1),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index c4a6280..ee633ae 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -23,7 +23,7 @@
   fi
 }
 
-# Runs simple_encoder using the codec specified by $1.
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 100.
 simple_encoder() {
   local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
@@ -35,7 +35,7 @@
   fi
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \
       ${devnull}
 
   [ -e "${output_file}" ] || return 1
@@ -47,16 +47,13 @@
   fi
 }
 
-# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
-# machine.
-DISABLED_simple_encoder_vp9() {
+simple_encoder_vp9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     simple_encoder vp9 || return 1
   fi
 }
 
 simple_encoder_tests="simple_encoder_vp8
-                      DISABLED_simple_encoder_vp9"
+                      simple_encoder_vp9"
 
 run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 3e2f49a..5022866 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -112,13 +112,13 @@
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 40);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-#if CONFIG_BIDIR_PRED
+#if CONFIG_EXT_REFS
   // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
   //       frames besides ALTREF_FRAME.
   EXPECT_GE(sf_count_, 1);
 #else
   EXPECT_EQ(sf_count_, 1);
-#endif  // CONFIG_BIDIR_PRED
+#endif  // CONFIG_EXT_REFS
 }
 
 VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
diff --git a/test/test.mk b/test/test.mk
index 28c0caa..cdef53c 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -25,6 +25,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += realtime_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
@@ -43,6 +44,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
@@ -108,6 +110,7 @@
 LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif
 
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += add_noise_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
@@ -148,6 +151,8 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 3e65fec..2acf744 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -191,14 +191,15 @@
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                 vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                 vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
                 vpx_tm_predictor_4x4_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
-                vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
+                NULL, NULL, NULL, NULL,
+                vpx_d153_predictor_4x4_ssse3, NULL,
                 vpx_d63_predictor_4x4_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
@@ -240,13 +241,13 @@
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                 vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                 vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_tm_predictor_8x8_sse2)
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
-                NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
+                NULL, NULL, NULL, NULL,
                 vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
                 vpx_d63_predictor_8x8_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index 1189e51..7a223f2 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -23,7 +23,8 @@
   fi
 }
 
-# Runs twopass_encoder using the codec specified by $1.
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
 twopass_encoder() {
   local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
@@ -35,7 +36,7 @@
   fi
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \
       ${devnull}
 
   [ -e "${output_file}" ] || return 1
@@ -47,16 +48,13 @@
   fi
 }
 
-# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
-# machine.
-DISABLED_twopass_encoder_vp9() {
+twopass_encoder_vp9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     twopass_encoder vp9 || return 1
   fi
 }
 
 twopass_encoder_tests="twopass_encoder_vp8
-                       DISABLED_twopass_encoder_vp9"
+                       twopass_encoder_vp9"
 
 run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 8ac8511..7eaed27 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1062,30 +1062,6 @@
     ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
-
-INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_mmx));
-
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxVarianceTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
-                      make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
-                      make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
-                      make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
-                      make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
-
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_mmx, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_mmx, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_mmx, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_mmx, 0)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_sse2));
@@ -1126,8 +1102,8 @@
                       make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
                       make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
                       make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelAvgVarianceTest,
@@ -1143,8 +1119,8 @@
         make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
         make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
         make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 #endif  // CONFIG_USE_X86INC
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index 6cf15de..ecbcf92 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc
@@ -230,7 +230,7 @@
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
 
-#if CONFIG_VP9_HIGHBITDEPTH || CONFIG_BIDIR_PRED
+#if CONFIG_VP9_HIGHBITDEPTH || CONFIG_EXT_REFS
 #if CONFIG_VP10_ENCODER
 // TODO(angiebird): 25-29 fail in high bitdepth mode.
 // TODO(zoeliu): This ArfFreqTest does not work with BWDREF_FRAME, as
@@ -252,5 +252,5 @@
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
-#endif  // CONFIG_VP9_HIGHBITDEPTH || CONFIG_BIDIR_PRED
+#endif  // CONFIG_VP9_HIGHBITDEPTH || CONFIG_EXT_REFS
 }  // namespace
diff --git a/test/vp9_denoiser_sse2_test.cc b/test/vp9_denoiser_sse2_test.cc
index 17c799d..c84d7ff 100644
--- a/test/vp9_denoiser_sse2_test.cc
+++ b/test/vp9_denoiser_sse2_test.cc
@@ -94,8 +94,7 @@
 // Test for all block size.
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9DenoiserTest,
-    ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
-                      BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
-                      BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
-                      BLOCK_64X64));
+    ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+                      BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
+                      BLOCK_64X32, BLOCK_64X64));
 }  // namespace
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 650bc52..8258756 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -62,7 +62,7 @@
 
   void FillFrame() {
     ASSERT_TRUE(vpx_ctx_->file != NULL);
-    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
       end_of_file_ = true;
@@ -72,7 +72,7 @@
   void SeekToNextKeyFrame() {
     ASSERT_TRUE(vpx_ctx_->file != NULL);
     do {
-      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
       ASSERT_GE(status, 0) << "webm_read_frame failed";
       ++frame_;
       if (status == 1) {
diff --git a/vp10/common/postproc.c b/vp10/common/postproc.c
index a6ea9c0..e8a9f81 100644
--- a/vp10/common/postproc.c
+++ b/vp10/common/postproc.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp10_rtcd.h"
 
@@ -587,32 +588,6 @@
   state->last_noise = a;
 }
 
-void vp10_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP10_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO *temp = cm->postproc_state.prev_mip;
@@ -727,7 +702,7 @@
       fillrd(ppstate, 63 - q, noise_level);
     }
 
-    vp10_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                         ppstate->whiteclamp, ppstate->bothclamp,
                         ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c
index 0ebac42..d41f389 100644
--- a/vp10/common/vp10_inv_txfm.c
+++ b/vp10/common/vp10_inv_txfm.c
@@ -36,10 +36,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, 8);
-    op[1] = WRAPLOW(b1, 8);
-    op[2] = WRAPLOW(c1, 8);
-    op[3] = WRAPLOW(d1, 8);
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
     ip += 4;
     op += 4;
   }
@@ -57,10 +57,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
 
     ip++;
     dest++;
@@ -79,8 +79,8 @@
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, 8);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -101,18 +101,18 @@
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(step[1] + step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[0] - step[3], 8);
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
 }
 
 void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -144,8 +144,8 @@
                          int dest_stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -167,48 +167,48 @@
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   temp1 = (step1[0] + step1[2]) * cospi_16_64;
   temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(step1[1] + step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(step1[3] + step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(step1[2] - step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
 }
 
 void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -239,8 +239,8 @@
 void vp10_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -269,7 +269,7 @@
   s4 = sinpi_1_9 * x2;
   s5 = sinpi_2_9 * x3;
   s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
+  s7 = WRAPLOW(x0 - x2 + x3);
 
   s0 = s0 + s3 + s5;
   s1 = s1 - s4 - s6;
@@ -280,10 +280,10 @@
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
 void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
@@ -314,14 +314,14 @@
   s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
   s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
 
   // stage 2
   s0 = (int)x0;
@@ -333,14 +333,14 @@
   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
 
-  x0 = WRAPLOW(s0 + s2, 8);
-  x1 = WRAPLOW(s1 + s3, 8);
-  x2 = WRAPLOW(s0 - s2, 8);
-  x3 = WRAPLOW(s1 - s3, 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 
   // stage 3
   s2 = (int)(cospi_16_64 * (x2 + x3));
@@ -348,19 +348,19 @@
   s6 = (int)(cospi_16_64 * (x6 + x7));
   s7 = (int)(cospi_16_64 * (x6 - x7));
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
 
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x4, 8);
-  output[2] = WRAPLOW(x6, 8);
-  output[3] = WRAPLOW(-x2, 8);
-  output[4] = WRAPLOW(x3, 8);
-  output[5] = WRAPLOW(-x7, 8);
-  output[6] = WRAPLOW(x5, 8);
-  output[7] = WRAPLOW(-x1, 8);
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
 }
 
 void vp10_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -423,23 +423,23 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 3
   step1[0] = step2[0];
@@ -449,109 +449,109 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(step2[1] + step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(step2[3] + step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(step2[5] + step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(step2[7] + step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(step2[6] - step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(step2[4] - step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(step2[2] - step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
 }
 
 void vp10_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -628,22 +628,22 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
 
   // stage 2
   s0 = x0;
@@ -663,22 +663,22 @@
   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, 8);
-  x1 = WRAPLOW(s1 + s5, 8);
-  x2 = WRAPLOW(s2 + s6, 8);
-  x3 = WRAPLOW(s3 + s7, 8);
-  x4 = WRAPLOW(s0 - s4, 8);
-  x5 = WRAPLOW(s1 - s5, 8);
-  x6 = WRAPLOW(s2 - s6, 8);
-  x7 = WRAPLOW(s3 - s7, 8);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
 
   // stage 3
   s0 = x0;
@@ -698,22 +698,22 @@
   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(check_range(s0 + s2), 8);
-  x1 = WRAPLOW(check_range(s1 + s3), 8);
-  x2 = WRAPLOW(check_range(s0 - s2), 8);
-  x3 = WRAPLOW(check_range(s1 - s3), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-  x8 = WRAPLOW(check_range(s8 + s10), 8);
-  x9 = WRAPLOW(check_range(s9 + s11), 8);
-  x10 = WRAPLOW(check_range(s8 - s10), 8);
-  x11 = WRAPLOW(check_range(s9 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -725,31 +725,31 @@
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
 
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x8, 8);
-  output[2] = WRAPLOW(x12, 8);
-  output[3] = WRAPLOW(-x4, 8);
-  output[4] = WRAPLOW(x6, 8);
-  output[5] = WRAPLOW(x14, 8);
-  output[6] = WRAPLOW(x10, 8);
-  output[7] = WRAPLOW(x2, 8);
-  output[8] = WRAPLOW(x3, 8);
-  output[9] = WRAPLOW(x11, 8);
-  output[10] = WRAPLOW(x15, 8);
-  output[11] = WRAPLOW(x7, 8);
-  output[12] = WRAPLOW(x5, 8);
-  output[13] = WRAPLOW(-x13, 8);
-  output[14] = WRAPLOW(x9, 8);
-  output[15] = WRAPLOW(-x1, 8);
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
 }
 
 void vp10_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -785,8 +785,8 @@
                             int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -819,43 +819,43 @@
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   step2[0] = step1[0];
@@ -869,40 +869,40 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
-  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
-  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
-  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
-  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
-  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
-  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
-  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
-  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
-  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
-  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
-  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
-  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
 
   // stage 3
   step1[0] = step2[0];
@@ -912,42 +912,42 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -956,87 +956,87 @@
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
-  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
-  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
-  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
-  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
 
-  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
-  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
-  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
-  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
-  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -1045,62 +1045,62 @@
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
-  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
-  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
-  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
-  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
-  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
-  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
 
-  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
-  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
-  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
-  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
-  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
-  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
-  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
-  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
-  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
-  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
-  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
-  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
-  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
-  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
-  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
-  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
-  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -1108,58 +1108,58 @@
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], 8);
-  output[1] = WRAPLOW(step1[1] + step1[30], 8);
-  output[2] = WRAPLOW(step1[2] + step1[29], 8);
-  output[3] = WRAPLOW(step1[3] + step1[28], 8);
-  output[4] = WRAPLOW(step1[4] + step1[27], 8);
-  output[5] = WRAPLOW(step1[5] + step1[26], 8);
-  output[6] = WRAPLOW(step1[6] + step1[25], 8);
-  output[7] = WRAPLOW(step1[7] + step1[24], 8);
-  output[8] = WRAPLOW(step1[8] + step1[23], 8);
-  output[9] = WRAPLOW(step1[9] + step1[22], 8);
-  output[10] = WRAPLOW(step1[10] + step1[21], 8);
-  output[11] = WRAPLOW(step1[11] + step1[20], 8);
-  output[12] = WRAPLOW(step1[12] + step1[19], 8);
-  output[13] = WRAPLOW(step1[13] + step1[18], 8);
-  output[14] = WRAPLOW(step1[14] + step1[17], 8);
-  output[15] = WRAPLOW(step1[15] + step1[16], 8);
-  output[16] = WRAPLOW(step1[15] - step1[16], 8);
-  output[17] = WRAPLOW(step1[14] - step1[17], 8);
-  output[18] = WRAPLOW(step1[13] - step1[18], 8);
-  output[19] = WRAPLOW(step1[12] - step1[19], 8);
-  output[20] = WRAPLOW(step1[11] - step1[20], 8);
-  output[21] = WRAPLOW(step1[10] - step1[21], 8);
-  output[22] = WRAPLOW(step1[9] - step1[22], 8);
-  output[23] = WRAPLOW(step1[8] - step1[23], 8);
-  output[24] = WRAPLOW(step1[7] - step1[24], 8);
-  output[25] = WRAPLOW(step1[6] - step1[25], 8);
-  output[26] = WRAPLOW(step1[5] - step1[26], 8);
-  output[27] = WRAPLOW(step1[4] - step1[27], 8);
-  output[28] = WRAPLOW(step1[3] - step1[28], 8);
-  output[29] = WRAPLOW(step1[2] - step1[29], 8);
-  output[30] = WRAPLOW(step1[1] - step1[30], 8);
-  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
 }
 
 void vp10_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1235,8 +1235,8 @@
   int i, j;
   tran_high_t a1;
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
@@ -1270,10 +1270,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, bd);
-    op[1] = WRAPLOW(b1, bd);
-    op[2] = WRAPLOW(c1, bd);
-    op[3] = WRAPLOW(d1, bd);
+    op[0] = HIGHBD_WRAPLOW(a1, bd);
+    op[1] = HIGHBD_WRAPLOW(b1, bd);
+    op[2] = HIGHBD_WRAPLOW(c1, bd);
+    op[3] = HIGHBD_WRAPLOW(d1, bd);
     ip += 4;
     op += 4;
   }
@@ -1291,11 +1291,14 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
-    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
-    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
-    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
-
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
+                                             HIGHBD_WRAPLOW(a1, bd), bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
+                                             HIGHBD_WRAPLOW(b1, bd), bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
+                                             HIGHBD_WRAPLOW(c1, bd), bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
+                                             HIGHBD_WRAPLOW(d1, bd), bd);
     ip++;
     dest++;
   }
@@ -1314,8 +1317,8 @@
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+  op[0] = HIGHBD_WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -1341,18 +1344,18 @@
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(step[1] + step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[0] - step[3], bd);
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
 }
 
 void vp10_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1386,11 +1389,11 @@
                                 int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1412,39 +1415,39 @@
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
   vp10_highbd_idct4_c(step1, step1, bd);
 
   // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   // stage 3 - odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
 }
 
 void vp10_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1478,10 +1481,10 @@
                                 int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -1511,7 +1514,7 @@
   s4 = sinpi_1_9 * x2;
   s5 = sinpi_2_9 * x3;
   s6 = sinpi_4_9 * x3;
-  s7 = (tran_high_t)(x0 - x2 + x3);
+  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
 
   s0 = s0 + s3 + s5;
   s1 = s1 - s4 - s6;
@@ -1522,10 +1525,10 @@
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
-  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
-  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
 void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1556,14 +1559,14 @@
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1575,14 +1578,14 @@
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1590,19 +1593,19 @@
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
 
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x4, bd);
-  output[2] = WRAPLOW(x6, bd);
-  output[3] = WRAPLOW(-x2, bd);
-  output[4] = WRAPLOW(x3, bd);
-  output[5] = WRAPLOW(-x7, bd);
-  output[6] = WRAPLOW(x5, bd);
-  output[7] = WRAPLOW(-x1, bd);
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x4, bd);
+  output[2] = HIGHBD_WRAPLOW(x6, bd);
+  output[3] = HIGHBD_WRAPLOW(-x2, bd);
+  output[4] = HIGHBD_WRAPLOW(x3, bd);
+  output[5] = HIGHBD_WRAPLOW(-x7, bd);
+  output[6] = HIGHBD_WRAPLOW(x5, bd);
+  output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vp10_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1667,23 +1670,23 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1693,109 +1696,109 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 }
 
 void vp10_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1875,22 +1878,22 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
-  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
-  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
+  x8  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+  x9  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -1910,22 +1913,22 @@
   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, bd);
-  x1 = WRAPLOW(s1 + s5, bd);
-  x2 = WRAPLOW(s2 + s6, bd);
-  x3 = WRAPLOW(s3 + s7, bd);
-  x4 = WRAPLOW(s0 - s4, bd);
-  x5 = WRAPLOW(s1 - s5, bd);
-  x6 = WRAPLOW(s2 - s6, bd);
-  x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
-  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -1945,22 +1948,22 @@
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-  x8 = WRAPLOW(s8 + s10, bd);
-  x9 = WRAPLOW(s9 + s11, bd);
-  x10 = WRAPLOW(s8 - s10, bd);
-  x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -1972,31 +1975,31 @@
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
 
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x8, bd);
-  output[2] = WRAPLOW(x12, bd);
-  output[3] = WRAPLOW(-x4, bd);
-  output[4] = WRAPLOW(x6, bd);
-  output[5] = WRAPLOW(x14, bd);
-  output[6] = WRAPLOW(x10, bd);
-  output[7] = WRAPLOW(x2, bd);
-  output[8] = WRAPLOW(x3, bd);
-  output[9] = WRAPLOW(x11, bd);
-  output[10] = WRAPLOW(x15, bd);
-  output[11] = WRAPLOW(x7, bd);
-  output[12] = WRAPLOW(x5, bd);
-  output[13] = WRAPLOW(-x13, bd);
-  output[14] = WRAPLOW(x9, bd);
-  output[15] = WRAPLOW(-x1, bd);
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x8, bd);
+  output[2] = HIGHBD_WRAPLOW(x12, bd);
+  output[3] = HIGHBD_WRAPLOW(-x4, bd);
+  output[4] = HIGHBD_WRAPLOW(x6, bd);
+  output[5] = HIGHBD_WRAPLOW(x14, bd);
+  output[6] = HIGHBD_WRAPLOW(x10, bd);
+  output[7] = HIGHBD_WRAPLOW(x2, bd);
+  output[8] = HIGHBD_WRAPLOW(x3, bd);
+  output[9] = HIGHBD_WRAPLOW(x11, bd);
+  output[10] = HIGHBD_WRAPLOW(x15, bd);
+  output[11] = HIGHBD_WRAPLOW(x7, bd);
+  output[12] = HIGHBD_WRAPLOW(x5, bd);
+  output[13] = HIGHBD_WRAPLOW(-x13, bd);
+  output[14] = HIGHBD_WRAPLOW(x9, bd);
+  output[15] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vp10_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2031,11 +2034,11 @@
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -2070,43 +2073,43 @@
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2120,40 +2123,40 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
-  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -2163,42 +2166,42 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2207,87 +2210,87 @@
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
 
-  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2296,62 +2299,62 @@
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
 
-  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -2359,58 +2362,58 @@
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
 }
 
 void vp10_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2486,9 +2489,9 @@
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/vp10/common/vp10_inv_txfm.h b/vp10/common/vp10_inv_txfm.h
index 52611ac..1751f62 100644
--- a/vp10/common/vp10_inv_txfm.h
+++ b/vp10/common/vp10_inv_txfm.h
@@ -15,13 +15,14 @@
 
 #include "./vpx_config.h"
 #include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
@@ -32,17 +33,17 @@
   assert(INT16_MIN <= input);
   assert(input <= INT16_MAX);
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
+  return rv;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input,
+                                             int bd) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid highbitdepth VP9 streams, intermediate stage coefficients will
   // stay within the ranges:
@@ -56,13 +57,12 @@
   (void) int_min;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   (void) bd;
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
+  return rv;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -83,9 +83,21 @@
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+        ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#else   // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+        ((int32_t)highbd_check_range((x), bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
@@ -107,14 +119,14 @@
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                              int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+  trans = HIGHBD_WRAPLOW(trans, bd);
+  return clip_pixel_highbd(dest + trans, bd);
 }
 #endif
 
 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
+  trans = WRAPLOW(trans);
+  return clip_pixel(dest + trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 1b501e2..51b674b 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -73,10 +73,6 @@
 specialize qw/vp10_post_proc_down_and_across sse2/;
 $vp10_post_proc_down_and_across_sse2=vp10_post_proc_down_and_across_xmm;
 
-add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp10_plane_add_noise sse2/;
-$vp10_plane_add_noise_sse2=vp10_plane_add_noise_wmt;
-
 add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp10_filter_by_weight16x16 sse2 msa/;
 
@@ -365,9 +361,6 @@
 
     add_proto qw/void vp10_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
     specialize qw/vp10_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp10_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp10_highbd_plane_add_noise/;
   }
 
   #
@@ -447,7 +440,7 @@
   specialize qw/vp10_fht32x32/;
 
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
 } else {
   add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht4x4 sse2/;
@@ -468,7 +461,7 @@
   specialize qw/vp10_fht32x32/;
 
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
 }
 
 add_proto qw/void vp10_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
diff --git a/vp10/common/x86/postproc_sse2.asm b/vp10/common/x86/postproc_sse2.asm
index d5f8e92..d477a65 100644
--- a/vp10/common/x86/postproc_sse2.asm
+++ b/vp10/common/x86/postproc_sse2.asm
@@ -624,68 +624,6 @@
 %undef flimit4
 
 
-;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp10_plane_add_noise_wmt) PRIVATE
-sym(vp10_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 34428a7..aceb10f 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -1111,7 +1111,7 @@
                           tx_size, ctx);
 #else
   vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
-                   tx_size, VP10_XFORM_QUANT_FP);
+                   tx_size, VP10_XFORM_QUANT_B);
 #endif  // CONFIG_NEW_QUANT
 
   if (p->eobs[block] > 0) {
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index d2757bf..5adba4c 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -2805,6 +2805,8 @@
       const double dr =
           (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
       const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
       if (cpi->b_calculate_psnr) {
         const double total_psnr =
@@ -2844,8 +2846,9 @@
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
 
-        fprintf(f, "%s\t    Time\n", headings);
-        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
+        fprintf(f, "%s\t    Time  Rc-Err Abs Err\n", headings);
+        fprintf(f, "%s\t%8.0f %7.2f %7.2f\n", results,
+                total_encode_time, rate_err, fabs(rate_err));
       }
 
       fclose(f);
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 608cace..d7c62b2 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -139,7 +139,7 @@
   int height;  // height of data passed to the compressor
   unsigned int input_bit_depth;  // Input bit depth.
   double init_framerate;  // set to passed in framerate
-  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
+  int64_t target_bandwidth;  // bandwidth to be used in bits per second
 
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;  // sharpening output: recommendation 0:
diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index 71a6662..add2510 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -45,7 +45,6 @@
 
 #define BOOST_BREAKOUT      12.5
 #define BOOST_FACTOR        12.5
-#define ERR_DIVISOR         128.0
 #define FACTOR_PT_LOW       0.70
 #define FACTOR_PT_HIGH      0.90
 #define FIRST_PASS_Q        10.0
@@ -230,6 +229,13 @@
   section->duration   -= frame->duration;
 }
 
+// Calculate the linear size relative to a baseline of 1080P
+#define BASE_SIZE 2073600.0  // 1920x1080
+static double get_linear_size_factor(const VP10_COMP *cpi) {
+  const double this_area = cpi->initial_width * cpi->initial_height;
+  return pow(this_area / BASE_SIZE, 0.5);
+}
+
 // Calculate an active area of the image that discounts formatting
 // bars and partially discounts other 0 energy areas.
 #define MIN_ACTIVE_AREA 0.5
@@ -1121,11 +1127,7 @@
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-// Larger image formats are expected to be a little harder to code relatively
-// given the same prediction error score. This in part at least relates to the
-// increased size and hence coding cost of motion vectors.
-#define EDIV_SIZE_FACTOR 800
-
+#define ERR_DIVISOR         100.0
 static int get_twopass_worst_quality(const VP10_COMP *cpi,
                                      const double section_err,
                                      double inactive_zone,
@@ -1144,12 +1146,22 @@
     const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+    double ediv_size_correction;
     const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
                                          BPER_MB_NORMBITS) / active_mbs;
-
     int q;
 
+    // Larger image formats are expected to be a little harder to code
+    // relatively given the same prediction error score. This in part at
+    // least relates to the increased size and hence coding overheads of
+    // motion vectors. Some account of this is made through adjustment of
+    // the error divisor.
+    ediv_size_correction =
+        VPXMAX(0.2, VPXMIN(5.0, get_linear_size_factor(cpi)));
+    if (ediv_size_correction < 1.0)
+      ediv_size_correction = -(1.0 / ediv_size_correction);
+    ediv_size_correction *= 4.0;
+
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
diff --git a/vp10/encoder/lookahead.c b/vp10/encoder/lookahead.c
index bc7b404..9e8f536 100644
--- a/vp10/encoder/lookahead.c
+++ b/vp10/encoder/lookahead.c
@@ -20,8 +20,8 @@
 
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
   assert(index < ctx->max_sz);
@@ -35,7 +35,7 @@
 void vp10_lookahead_destroy(struct lookahead_ctx *ctx) {
   if (ctx) {
     if (ctx->buf) {
-      unsigned int i;
+      int i;
 
       for (i = 0; i < ctx->max_sz; i++)
         vpx_free_frame_buffer(&ctx->buf[i].img);
@@ -221,9 +221,9 @@
 
   if (index >= 0) {
     // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
       index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
         index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
diff --git a/vp10/encoder/lookahead.h b/vp10/encoder/lookahead.h
index 22429ae..f650f80 100644
--- a/vp10/encoder/lookahead.h
+++ b/vp10/encoder/lookahead.h
@@ -31,10 +31,10 @@
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
   struct lookahead_entry *buf; /* Buffer list */
 };
 
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 8792db2..36b4804 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -1158,12 +1158,12 @@
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
-  int l;
+  int l, m;
   t = d;
   for (l = 0; t > 1; l++)
     t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (int16_t)(t - (1 << 16));
+  m = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(m - (1 << 16));
   *shift = 1 << (16 - l);
 }
 
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_intrin_sse2.c
similarity index 100%
rename from vp10/encoder/x86/dct_sse2.c
rename to vp10/encoder/x86/dct_intrin_sse2.c
diff --git a/vp10/encoder/x86/dct_mmx.asm b/vp10/encoder/x86/dct_mmx.asm
deleted file mode 100644
index 2327fe9..0000000
--- a/vp10/encoder/x86/dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
diff --git a/vp10/encoder/x86/dct_sse2.asm b/vp10/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000..c3a5fb5
--- /dev/null
+++ b/vp10/encoder/x86/dct_sse2.asm
@@ -0,0 +1,86 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 007fb4e..5d5c88a 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -104,7 +104,7 @@
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
 endif
 
@@ -114,7 +114,7 @@
 endif
 endif
 
-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index 9824a31..bb6ea76 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -21,114 +21,6 @@
     { 16, 112}
 };
 
-void vp8_bilinear_predict4x4_neon(
-        unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        unsigned char *dst_ptr,
-        int dst_pitch) {
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
-    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
-    uint8x16_t q1u8, q2u8;
-    uint16x8_t q1u16, q2u16;
-    uint16x8_t q7u16, q8u16, q9u16;
-    uint64x2_t q4u64, q5u64;
-    uint64x1_t d12u64;
-    uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
-
-    if (xoffset == 0) {  // skip_1stpass_filter
-        uint32x2_t d28u32 = vdup_n_u32(0);
-        uint32x2_t d29u32 = vdup_n_u32(0);
-        uint32x2_t d30u32 = vdup_n_u32(0);
-
-        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
-        src_ptr += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
-        src_ptr += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
-        src_ptr += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
-        src_ptr += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
-        d28u8 = vreinterpret_u8_u32(d28u32);
-        d29u8 = vreinterpret_u8_u32(d29u32);
-        d30u8 = vreinterpret_u8_u32(d30u32);
-    } else {
-        d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d6u8 = vld1_u8(src_ptr);
-
-        q1u8 = vcombine_u8(d2u8, d3u8);
-        q2u8 = vcombine_u8(d4u8, d5u8);
-
-        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
-
-        q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
-        q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
-        d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
-
-        d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
-                           vreinterpret_u32_u8(vget_high_u8(q1u8)));
-        d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
-                           vreinterpret_u32_u8(vget_high_u8(q2u8)));
-        d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
-                           vreinterpret_u32_u64(vget_high_u64(q4u64)));
-        d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
-                           vreinterpret_u32_u64(vget_high_u64(q5u64)));
-
-        q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-        q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-        q9u16 = vmull_u8(d6u8, d0u8);
-
-        q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
-        q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
-        q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
-
-        d28u8 = vqrshrn_n_u16(q7u16, 7);
-        d29u8 = vqrshrn_n_u16(q8u16, 7);
-        d30u8 = vqrshrn_n_u16(q9u16, 7);
-    }
-
-    // secondpass_filter
-    if (yoffset == 0) {  // skip_2ndpass_filter
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
-    } else {
-        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
-        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
-
-        q1u16 = vmull_u8(d28u8, d0u8);
-        q2u16 = vmull_u8(d29u8, d0u8);
-
-        d26u8 = vext_u8(d28u8, d29u8, 4);
-        d27u8 = vext_u8(d29u8, d30u8, 4);
-
-        q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
-        q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
-
-        d2u8 = vqrshrn_n_u16(q1u16, 7);
-        d3u8 = vqrshrn_n_u16(q2u16, 7);
-
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-    }
-    return;
-}
-
 void vp8_bilinear_predict8x4_neon(
         unsigned char *src_ptr,
         int src_pixels_per_line,
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
index 4c2efc9..49d8d22 100644
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -22,383 +22,6 @@
     {0, -1,   12, 123,  -6, 0, 0, 0},
 };
 
-void vp8_sixtap_predict4x4_neon(
-        unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        unsigned char *dst_ptr,
-        int dst_pitch) {
-    unsigned char *src;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
-    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
-    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
-    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
-    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
-    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
-    uint32x2x2_t d0u32x2, d1u32x2;
-
-    if (xoffset == 0) {  // secondpass_filter4x4_only
-        uint32x2_t d27u32 = vdup_n_u32(0);
-        uint32x2_t d28u32 = vdup_n_u32(0);
-        uint32x2_t d29u32 = vdup_n_u32(0);
-        uint32x2_t d30u32 = vdup_n_u32(0);
-        uint32x2_t d31u32 = vdup_n_u32(0);
-
-        // load second_pass filter
-        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
-        d0s8 = vdup_lane_s8(dtmps8, 0);
-        d1s8 = vdup_lane_s8(dtmps8, 1);
-        d2s8 = vdup_lane_s8(dtmps8, 2);
-        d3s8 = vdup_lane_s8(dtmps8, 3);
-        d4s8 = vdup_lane_s8(dtmps8, 4);
-        d5s8 = vdup_lane_s8(dtmps8, 5);
-        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-        // load src data
-        src = src_ptr - src_pixels_per_line * 2;
-        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
-        src += src_pixels_per_line;
-        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
-        src += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
-        src += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
-        src += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
-        src += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
-        src += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
-        src += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
-        src += src_pixels_per_line;
-        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
-
-        d27u8 = vreinterpret_u8_u32(d27u32);
-        d28u8 = vreinterpret_u8_u32(d28u32);
-        d29u8 = vreinterpret_u8_u32(d29u32);
-        d30u8 = vreinterpret_u8_u32(d30u32);
-        d31u8 = vreinterpret_u8_u32(d31u32);
-
-        d23u8 = vext_u8(d27u8, d28u8, 4);
-        d24u8 = vext_u8(d28u8, d29u8, 4);
-        d25u8 = vext_u8(d29u8, d30u8, 4);
-        d26u8 = vext_u8(d30u8, d31u8, 4);
-
-        q3u16 = vmull_u8(d27u8, d0u8);
-        q4u16 = vmull_u8(d28u8, d0u8);
-        q5u16 = vmull_u8(d25u8, d5u8);
-        q6u16 = vmull_u8(d26u8, d5u8);
-
-        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
-        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
-        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
-        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
-
-        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
-        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
-        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
-        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
-
-        q3s16 = vreinterpretq_s16_u16(q3u16);
-        q4s16 = vreinterpretq_s16_u16(q4u16);
-        q5s16 = vreinterpretq_s16_u16(q5u16);
-        q6s16 = vreinterpretq_s16_u16(q6u16);
-
-        q5s16 = vqaddq_s16(q5s16, q3s16);
-        q6s16 = vqaddq_s16(q6s16, q4s16);
-
-        d3u8 = vqrshrun_n_s16(q5s16, 7);
-        d4u8 = vqrshrun_n_s16(q6s16, 7);
-
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
-        return;
-    }
-
-    // load first_pass filter
-    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
-    d0s8 = vdup_lane_s8(dtmps8, 0);
-    d1s8 = vdup_lane_s8(dtmps8, 1);
-    d2s8 = vdup_lane_s8(dtmps8, 2);
-    d3s8 = vdup_lane_s8(dtmps8, 3);
-    d4s8 = vdup_lane_s8(dtmps8, 4);
-    d5s8 = vdup_lane_s8(dtmps8, 5);
-    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-    // First pass: output_height lines x output_width columns (9x4)
-
-    if (yoffset == 0)  // firstpass_filter4x4_only
-        src = src_ptr - 2;
-    else
-        src = src_ptr - 2 - (src_pixels_per_line * 2);
-
-    q3u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q4u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q5u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q6u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-
-    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
-    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
-    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
-    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
-
-    // vswp here
-    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
-    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
-                       vreinterpret_u32_u8(d19u8));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
-                       vreinterpret_u32_u8(d21u8));
-    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
-    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
-
-    // keep original src data in q4 q6
-    q4u64 = vreinterpretq_u64_u8(q3u8);
-    q6u64 = vreinterpretq_u64_u8(q5u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
-                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
-                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
-    q9u64 = vshrq_n_u64(q4u64, 8);
-    q10u64 = vshrq_n_u64(q6u64, 8);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 32);
-    q5u64 = vshrq_n_u64(q6u64, 32);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u64 = vshrq_n_u64(q4u64, 16);
-    q10u64 = vshrq_n_u64(q6u64, 16);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 24);
-    q5u64 = vshrq_n_u64(q6u64, 24);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
-    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
-
-    q7s16 = vreinterpretq_s16_u16(q7u16);
-    q8s16 = vreinterpretq_s16_u16(q8u16);
-    q9s16 = vreinterpretq_s16_u16(q9u16);
-    q10s16 = vreinterpretq_s16_u16(q10u16);
-    q7s16 = vqaddq_s16(q7s16, q9s16);
-    q8s16 = vqaddq_s16(q8s16, q10s16);
-
-    d27u8 = vqrshrun_n_s16(q7s16, 7);
-    d28u8 = vqrshrun_n_s16(q8s16, 7);
-
-    if (yoffset == 0) {  // firstpass_filter4x4_only
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
-        return;
-    }
-
-    // First Pass on rest 5-line data
-    q3u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q4u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q5u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q6u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q11u8 = vld1q_u8(src);
-
-    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
-    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
-    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
-    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
-
-    // vswp here
-    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
-    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
-                       vreinterpret_u32_u8(d19u8));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
-                       vreinterpret_u32_u8(d21u8));
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
-    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
-    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
-    q12u16 = vmull_u8(d31u8, d5u8);
-
-    q4u64 = vreinterpretq_u64_u8(q3u8);
-    q6u64 = vreinterpretq_u64_u8(q5u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
-                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
-                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
-    q9u64 = vshrq_n_u64(q4u64, 8);
-    q10u64 = vshrq_n_u64(q6u64, 8);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 32);
-    q5u64 = vshrq_n_u64(q6u64, 32);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
-    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u64 = vshrq_n_u64(q4u64, 16);
-    q10u64 = vshrq_n_u64(q6u64, 16);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
-    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 24);
-    q5u64 = vshrq_n_u64(q6u64, 24);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
-    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
-    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
-    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
-    q11u16 = vmull_u8(d31u8, d3u8);
-
-    q7s16 = vreinterpretq_s16_u16(q7u16);
-    q8s16 = vreinterpretq_s16_u16(q8u16);
-    q9s16 = vreinterpretq_s16_u16(q9u16);
-    q10s16 = vreinterpretq_s16_u16(q10u16);
-    q11s16 = vreinterpretq_s16_u16(q11u16);
-    q12s16 = vreinterpretq_s16_u16(q12u16);
-    q7s16 = vqaddq_s16(q7s16, q9s16);
-    q8s16 = vqaddq_s16(q8s16, q10s16);
-    q12s16 = vqaddq_s16(q12s16, q11s16);
-
-    d29u8 = vqrshrun_n_s16(q7s16, 7);
-    d30u8 = vqrshrun_n_s16(q8s16, 7);
-    d31u8 = vqrshrun_n_s16(q12s16, 7);
-
-    // Second pass: 4x4
-    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
-    d0s8 = vdup_lane_s8(dtmps8, 0);
-    d1s8 = vdup_lane_s8(dtmps8, 1);
-    d2s8 = vdup_lane_s8(dtmps8, 2);
-    d3s8 = vdup_lane_s8(dtmps8, 3);
-    d4s8 = vdup_lane_s8(dtmps8, 4);
-    d5s8 = vdup_lane_s8(dtmps8, 5);
-    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-    d23u8 = vext_u8(d27u8, d28u8, 4);
-    d24u8 = vext_u8(d28u8, d29u8, 4);
-    d25u8 = vext_u8(d29u8, d30u8, 4);
-    d26u8 = vext_u8(d30u8, d31u8, 4);
-
-    q3u16 = vmull_u8(d27u8, d0u8);
-    q4u16 = vmull_u8(d28u8, d0u8);
-    q5u16 = vmull_u8(d25u8, d5u8);
-    q6u16 = vmull_u8(d26u8, d5u8);
-
-    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
-    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
-    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
-    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
-
-    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
-    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
-    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
-    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
-
-    q3s16 = vreinterpretq_s16_u16(q3u16);
-    q4s16 = vreinterpretq_s16_u16(q4u16);
-    q5s16 = vreinterpretq_s16_u16(q5u16);
-    q6s16 = vreinterpretq_s16_u16(q6u16);
-
-    q5s16 = vqaddq_s16(q5s16, q3s16);
-    q6s16 = vqaddq_s16(q6s16, q4s16);
-
-    d3u8 = vqrshrun_n_s16(q5s16, 7);
-    d4u8 = vqrshrun_n_s16(q6s16, 7);
-
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
-    return;
-}
-
 void vp8_sixtap_predict8x4_neon(
         unsigned char *src_ptr,
         int src_pixels_per_line,
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index 155847c..472a7b5 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -104,7 +104,7 @@
 extern const unsigned char vp8_mbsplit_offset[4][16];
 
 
-static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
 {
     if (!(b & 3))
     {
@@ -119,7 +119,8 @@
     return (cur_mb->bmi + b - 1)->mv.as_int;
 }
 
-static INLINE int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+                                      int mi_stride)
 {
     if (!(b >> 2))
     {
diff --git a/vp8/common/mips/msa/postproc_msa.c b/vp8/common/mips/msa/postproc_msa.c
index c88f302..23dcde2 100644
--- a/vp8/common/mips/msa/postproc_msa.c
+++ b/vp8/common/mips/msa/postproc_msa.c
@@ -10,6 +10,7 @@
 
 #include <stdlib.h>
 #include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 
 static const int16_t vp8_rv_msa[] =
@@ -798,54 +799,3 @@
         }
     }
 }
-
-void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16],
-                             uint32_t width, uint32_t height,
-                             int32_t pitch)
-{
-    uint32_t i, j;
-
-    for (i = 0; i < height / 2; ++i)
-    {
-        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
-        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));
-        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
-        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
-        for (j = width / 16; j--;)
-        {
-            v16i8 temp00_s, temp01_s;
-            v16u8 temp00, temp01, black_clamp, white_clamp;
-            v16u8 pos0, ref0, pos1, ref1;
-            v16i8 const127 = __msa_ldi_b(127);
-
-            pos0 = LD_UB(pos0_ptr);
-            ref0 = LD_UB(ref0_ptr);
-            pos1 = LD_UB(pos1_ptr);
-            ref1 = LD_UB(ref1_ptr);
-            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
-            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
-            temp00 = (pos0 < black_clamp);
-            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
-            temp01 = (pos1 < black_clamp);
-            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp00 = (v16u8)(temp00_s < pos0);
-            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
-            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp01 = (temp01_s < pos1);
-            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            pos0 += ref0;
-            ST_UB(pos0, pos0_ptr);
-            pos1 += ref1;
-            ST_UB(pos1, pos1_ptr);
-            pos0_ptr += 16;
-            pos1_ptr += 16;
-            ref0_ptr += 16;
-            ref1_ptr += 16;
-        }
-    }
-}
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 322b613..6baf00f 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_config.h"
+#include "vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "vpx_scale_rtcd.h"
 #include "vpx_scale/yv12config.h"
@@ -490,54 +491,6 @@
     state->last_noise = a;
 }
 
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
- *                                  noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int Width, unsigned int Height, int Pitch)
-{
-    unsigned int i, j;
-    (void)bothclamp;
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = (char *)(noise + (rand() & 0xff));
-
-        for (j = 0; j < Width; j++)
-        {
-            if (Pos[j] < blackclamp[0])
-                Pos[j] = blackclamp[0];
-
-            if (Pos[j] > 255 + whiteclamp[0])
-                Pos[j] = 255 + whiteclamp[0];
-
-            Pos[j] += Ref[j];
-        }
-    }
-}
-
 /* Blend the macro block with a solid colored square.  Leave the
  * edges unblended to give distinction to macro blocks in areas
  * filled with the same color block.
@@ -828,7 +781,7 @@
             fillrd(&oci->postproc_state, 63 - q, noise_level);
         }
 
-        vp8_plane_add_noise
+        vpx_plane_add_noise
         (oci->post_proc_buffer.y_buffer,
          oci->postproc_state.noise,
          oci->postproc_state.blackclamp,
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 6799c27..856ede1 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -167,10 +167,6 @@
     add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
     specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
 
-    add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
-    specialize qw/vp8_plane_add_noise mmx sse2 msa/;
-    $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
-
     add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
     # no asm yet
 
@@ -209,7 +205,6 @@
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=817
 specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
 $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
@@ -227,7 +222,6 @@
 $vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
 
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892
 specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
 
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index c00e517..183b49b 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -44,8 +44,8 @@
 #include <os2.h>
 
 #include <stdlib.h>
-#define THREAD_FUNCTION void
-#define THREAD_FUNCTION_RETURN void
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
 #define THREAD_SPECIFIC_INDEX PULONG
 #define pthread_t TID
 #define pthread_attr_t ULONG
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index a2b1632..1a89e7e 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -241,68 +241,6 @@
 %undef flimit2
 
 
-;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_mmx) PRIVATE
-sym(vp8_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 Blur:
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index fed4ee5..de17afa 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -655,68 +655,6 @@
 %undef flimit4
 
 
-;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_wmt) PRIVATE
-sym(vp8_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 four8s:
diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
index 8a7e332..5cdd2a2 100644
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -44,7 +44,7 @@
     int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
     size_t bytes_left = br->user_buffer_end - bufptr;
     size_t bits_left = bytes_left * CHAR_BIT;
-    int x = (int)(shift + CHAR_BIT - bits_left);
+    int x = shift + CHAR_BIT - (int)bits_left;
     int loop_end = 0;
     unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
 
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index cc9eaaf..1b1bbf8 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -83,7 +83,7 @@
     }
 
     {
-        register unsigned int shift = vp8_norm[range];
+        register int shift = vp8_norm[range];
         range <<= shift;
         value <<= shift;
         count -= shift;
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index 4bc87eb..566972e 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -986,7 +986,8 @@
     VP8_COMMON *const pc = &pbi->common;
     MACROBLOCKD *const xd  = &pbi->mb;
     const unsigned char *data = pbi->fragments.ptrs[0];
-    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
+    const unsigned int data_sz = pbi->fragments.sizes[0];
+    const unsigned char *data_end = data + data_sz;
     ptrdiff_t first_partition_length_in_bytes;
 
     int i, j, k, l;
@@ -1022,7 +1023,7 @@
         const unsigned char *clear = data;
         if (pbi->decrypt_cb)
         {
-            int n = (int)VPXMIN(sizeof(clear_buffer), data_end - data);
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_sz);
             pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
             clear = clear_buffer;
         }
diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index 0b846a0..fbfae61 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -194,7 +194,7 @@
         return;
     }
 
-    if (new_row <= (-4 << 3) || new_col <= (-4 << 3))
+    if (new_row <= -32 || new_col <= -32)
     {
         /* outside the frame */
         return;
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index f3d91b5..3196422 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -163,7 +163,7 @@
 {
     const TOKENEXTRA *stop = p + xcount;
     unsigned int split;
-    unsigned int shift;
+    int shift;
     int count = w->count;
     unsigned int range = w->range;
     unsigned int lowvalue = w->lowvalue;
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 7c012a8..e66a2db 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -65,7 +65,7 @@
     int count = br->count;
     unsigned int range = br->range;
     unsigned int lowvalue = br->lowvalue;
-    register unsigned int shift;
+    register int shift;
 
 #ifdef VP8_ENTROPY_STATS
 #if defined(SECTIONBITS_OUTPUT)
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index cff99c0..26ce120 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -529,7 +529,7 @@
         // Bias on zero motion vector sse.
         const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
         zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
-        sse_diff = zero_mv_sse - best_sse;
+        sse_diff = (int)zero_mv_sse - (int)best_sse;
 
         saved_mbmi = *mbmi;
 
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 34c561d..8c126c1 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-#define SUM_DIFF_THRESHOLD 448
-#define SUM_DIFF_THRESHOLD_HIGH 512
+#define SUM_DIFF_THRESHOLD 512
+#define SUM_DIFF_THRESHOLD_HIGH 600
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 4c2acc7..c526a3e 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -18,6 +18,7 @@
 #include "onyx_int.h"
 #include "vpx_dsp/variance.h"
 #include "encodeintra.h"
+#include "vp8/common/common.h"
 #include "vp8/common/setupintrarecon.h"
 #include "vp8/common/systemdependent.h"
 #include "mcomp.h"
@@ -2417,7 +2418,7 @@
     int tmp_q;
     int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
 
-    FIRSTPASS_STATS this_frame = {0};
+    FIRSTPASS_STATS this_frame;
     FIRSTPASS_STATS this_frame_copy;
 
     double this_frame_intra_error;
@@ -2425,6 +2426,8 @@
 
     int overhead_bits;
 
+    vp8_zero(this_frame);
+
     if (!cpi->twopass.stats_in)
     {
         return ;
@@ -2808,7 +2811,8 @@
              * static scene.
              */
             if ( detect_transition_to_still( cpi, i,
-                                             (cpi->key_frame_frequency-i),
+                                             ((int)(cpi->key_frame_frequency) -
+                                              (int)i),
                                              loop_decay_rate,
                                              decay_accumulator ) )
             {
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 768c764..e20c1ea 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1591,7 +1591,6 @@
     int col_min = ref_col - distance;
     int col_max = ref_col + distance;
 
-    // TODO(johannkoenig): check if this alignment is necessary.
     DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
     unsigned int sad_array[3];
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 0efdac4..8511af2 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1523,7 +1523,8 @@
 void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
 {
     VP8_COMMON *cm = &cpi->common;
-    int last_w, last_h, prev_number_of_layers;
+    int last_w, last_h;
+    unsigned int prev_number_of_layers;
 
     if (!cpi)
         return;
@@ -1786,10 +1787,8 @@
     if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
         cpi->force_next_frame_intra = 1;
 
-    if (((cm->Width + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_height ||
+    if (((cm->Width + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
     {
         dealloc_raw_frame_buffers(cpi);
@@ -2247,6 +2246,8 @@
             double total_encode_time = (cpi->time_receive_data +
                                             cpi->time_compress_data) / 1000.000;
             double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
+            const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+            const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
             if (cpi->b_calculate_psnr)
             {
@@ -2292,12 +2293,14 @@
                                                       cpi->summed_weights, 8.0);
 
                     fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
-                               "GLPsnrP\tVPXSSIM\t  Time(us)\n");
+                               "GLPsnrP\tVPXSSIM\t  Time(us)  Rc-Err "
+                               "Abs Err\n");
                     fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                               "%7.3f\t%8.0f\n",
+                               "%7.3f\t%8.0f %7.2f %7.2f\n",
                                dr, cpi->total / cpi->count, total_psnr,
                                cpi->totalp / cpi->count, total_psnr2,
-                               total_ssim, total_encode_time);
+                               total_ssim, total_encode_time,
+                               rate_err, fabs(rate_err));
                 }
             }
             fclose(f);
@@ -5168,7 +5171,7 @@
         vp8_second_pass(cpi);
 
     encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
-    cpi->twopass.bits_left -= 8 * *size;
+    cpi->twopass.bits_left -= 8 * (int)(*size);
 
     if (!cpi->common.refresh_alt_ref_frame)
     {
@@ -5772,7 +5775,7 @@
         return -1;
 
     // Check number of rows and columns match
-    if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+    if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols)
         return -1;
 
     // Range check the delta Q values and convert the external Q range values
@@ -5828,7 +5831,7 @@
 
 int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
 {
-    if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+    if ((int)rows == cpi->common.mb_rows && (int)cols == cpi->common.mb_cols)
     {
         if (map)
         {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 86f401c..716f878c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -371,7 +371,7 @@
     double key_frame_rate_correction_factor;
     double gf_rate_correction_factor;
 
-    unsigned int frames_since_golden;
+    int frames_since_golden;
     /* Count down till next GF */
     int frames_till_gf_update_due;
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0708d65..24b332d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -90,7 +90,7 @@
     {
       int i = 0;
       // No skin if block has been zero motion for long consecutive time.
-      if (consec_zeromv > 80)
+      if (consec_zeromv > 60)
         return 0;
       // Exit on grey.
        if (cb == 128 && cr == 128)
@@ -103,7 +103,7 @@
          if (skin_color_diff < skin_threshold[i + 1]) {
             if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
               return 0;
-            else if (consec_zeromv > 30 &&
+            else if (consec_zeromv > 25 &&
                      skin_color_diff > (skin_threshold[i + 1] >> 1))
               return 0;
             else
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 9063cea..6507ae9 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1899,7 +1899,8 @@
                     int prob_skip_cost;
 
                     prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
-                    prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    prob_skip_cost -=
+                        (int)vp8_cost_bit(cpi->prob_skip_false, 0);
                     rd->rate2 += prob_skip_cost;
                     *other_cost += prob_skip_cost;
                 }
diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c
index ee922c9..0d101ba 100644
--- a/vp8/encoder/vp8_quantize.c
+++ b/vp8/encoder/vp8_quantize.c
@@ -227,12 +227,12 @@
     if(improved_quant)
     {
         unsigned t;
-        int l;
+        int l, m;
         t = d;
         for(l = 0; t > 1; l++)
             t>>=1;
-        t = 1 + (1<<(16+l))/d;
-        *quant = (short)(t - (1<<16));
+        m = 1 + (1<<(16+l))/d;
+        *quant = (short)(m - (1<<16));
         *shift = l;
         /* use multiplication and constant shift by 16 */
         *shift = 1 << (16 - *shift);
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 257d2a0..b19ab7a 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -22,6 +22,7 @@
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
 #include "vp8/common/onyx.h"
+#include "vp8/common/common.h"
 #include <stdlib.h>
 #include <string.h>
 
@@ -760,7 +761,7 @@
                                     unsigned long          duration,
                                     unsigned long          deadline)
 {
-    unsigned int new_qc;
+    int new_qc;
 
 #if !(CONFIG_REALTIME_ONLY)
     /* Use best quality mode if no deadline is given. */
@@ -785,7 +786,9 @@
     new_qc = MODE_REALTIME;
 #endif
 
-    if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+    if (deadline == VPX_DL_REALTIME)
+        new_qc = MODE_REALTIME;
+    else if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
         new_qc = MODE_FIRSTPASS;
     else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
         new_qc = (new_qc == MODE_BESTQUALITY)
@@ -1116,7 +1119,8 @@
 {
 
     YV12_BUFFER_CONFIG sd;
-    vp8_ppflags_t flags = {0};
+    vp8_ppflags_t flags;
+    vp8_zero(flags);
 
     if (ctx->preview_ppcfg.post_proc_flag)
     {
@@ -1305,8 +1309,8 @@
         30,                 /* rc_resize_up_thresold */
 
         VPX_VBR,            /* rc_end_usage */
-        {0},                /* rc_twopass_stats_in */
-        {0},                /* rc_firstpass_mb_stats_in */
+        {NULL, 0},          /* rc_twopass_stats_in */
+        {NULL, 0},          /* rc_firstpass_mb_stats_in */
         256,                /* rc_target_bandwidth */
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
@@ -1334,6 +1338,8 @@
         {0},                /* ts_rate_decimator */
         0,                  /* ts_periodicity */
         {0},                /* ts_layer_id */
+        {0},                /* layer_target_bitrate */
+        0                   /* temporal_layering_mode */
     }},
 };
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 9c78de1..fc9288d 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -522,7 +522,8 @@
     {
         YV12_BUFFER_CONFIG sd;
         int64_t time_stamp = 0, time_end_stamp = 0;
-        vp8_ppflags_t flags = {0};
+        vp8_ppflags_t flags;
+        vp8_zero(flags);
 
         if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
         {
@@ -816,11 +817,12 @@
     },
     { /* encoder functions */
         0,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL
+        NULL,  /* vpx_codec_enc_cfg_map_t */
+        NULL,  /* vpx_codec_encode_fn_t */
+        NULL,  /* vpx_codec_get_cx_data_fn_t */
+        NULL,  /* vpx_codec_enc_config_set_fn_t */
+        NULL,  /* vpx_codec_get_global_headers_fn_t */
+        NULL,  /* vpx_codec_get_preview_frame_fn_t */
+        NULL   /* vpx_codec_enc_mr_get_mem_loc_fn_t */
     }
 };
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 9d5dbc6..38815ac 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -67,7 +67,6 @@
 
 #define VP9_FRAME_MARKER 0x2
 
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index a6dae6a..3409d04 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -159,3 +159,18 @@
   {0,  8 },  // 64X32 - {0b0000, 0b1000}
   {0,  0 },  // 64X64 - {0b0000, 0b0000}
 };
+
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+const uint8_t need_top_left[INTRA_MODES] = {
+    0,  // DC_PRED
+    0,  // V_PRED
+    0,  // H_PRED
+    0,  // D45_PRED
+    1,  // D135_PRED
+    1,  // D117_PRED
+    1,  // D153_PRED
+    0,  // D207_PRED
+    0,  // D63_PRED
+    1,  // TM_PRED
+};
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 95a1179..0ae24da 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -33,6 +33,9 @@
 extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES];
 extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
 extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+extern const uint8_t need_top_left[INTRA_MODES];
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index aca69bd..183dec4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -298,196 +298,168 @@
 
 static void filter_selectively_vert_row2(int subsampling_factor,
                                          uint8_t *s, int pitch,
-                                         unsigned int mask_16x16_l,
-                                         unsigned int mask_8x8_l,
-                                         unsigned int mask_4x4_l,
-                                         unsigned int mask_4x4_int_l,
-                                         const loop_filter_info_n *lfi_n,
+                                         unsigned int mask_16x16,
+                                         unsigned int mask_8x8,
+                                         unsigned int mask_4x4,
+                                         unsigned int mask_4x4_int,
+                                         const loop_filter_thresh *lfthr,
                                          const uint8_t *lfl) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
   const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
   unsigned int mask;
+  uint8_t *ss[2];
+  ss[0] = s;
 
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;
 
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                   lfi0->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                              lfi0->hev_thr);
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                   lfis[0]->hev_thr);
         } else {
-          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                              lfi1->lim, lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                              lfi->lim, lfi->hev_thr);
         }
       }
 
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_8x8_0 & 1) {
-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
         }
       }
 
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
         }
       }
 
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                  lfis[0]->lim, lfis[0]->hev_thr,
+                                  lfis[1]->mblim, lfis[1]->lim,
+                                  lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim,
+                             lfi->lim, lfi->hev_thr);
         }
       }
     }
 
-    s += 8;
+    ss[0] += 8;
     lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                                 uint16_t *s, int pitch,
-                                                unsigned int mask_16x16_l,
-                                                unsigned int mask_8x8_l,
-                                                unsigned int mask_4x4_l,
-                                                unsigned int mask_4x4_int_l,
-                                                const loop_filter_info_n *lfi_n,
+                                                unsigned int mask_16x16,
+                                                unsigned int mask_8x8,
+                                                unsigned int mask_4x4,
+                                                unsigned int mask_4x4_int,
+                                                const loop_filter_thresh *lfthr,
                                                 const uint8_t *lfl, int bd) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
   const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
   unsigned int mask;
+  uint16_t *ss[2];
+  ss[0] = s;
 
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;
 
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, bd);
-        } else if (mask_16x16_0 & 1) {
-          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, bd);
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim,
+                                          lfis[0]->lim, lfis[0]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                                     lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_8x8_0 & 1) {
-          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, bd);
         }
       }
     }
 
-    s += 8;
+    ss[0] += 8;
     lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -497,17 +469,17 @@
                                      unsigned int mask_8x8,
                                      unsigned int mask_4x4,
                                      unsigned int mask_4x4_int,
-                                     const loop_filter_info_n *lfi_n,
+                                     const loop_filter_thresh *lfthr,
                                      const uint8_t *lfl) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
     count = 1;
     if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
           vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
@@ -520,7 +492,7 @@
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -549,7 +521,7 @@
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -574,7 +546,7 @@
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr);
         }
-      } else if (mask_4x4_int & 1) {
+      } else {
         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                              lfi->hev_thr);
       }
@@ -594,17 +566,17 @@
                                             unsigned int mask_8x8,
                                             unsigned int mask_4x4,
                                             unsigned int mask_4x4_int,
-                                            const loop_filter_info_n *lfi_n,
+                                            const loop_filter_thresh *lfthr,
                                             const uint8_t *lfl, int bd) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
     count = 1;
     if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
           vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
@@ -617,7 +589,7 @@
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -650,7 +622,7 @@
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -679,7 +651,7 @@
                                         lfi->lim, lfi->hev_thr, bd);
           }
         }
-      } else if (mask_4x4_int & 1) {
+      } else {
         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, bd);
       }
@@ -700,7 +672,6 @@
 // whether there were any coefficients encoded, and the loop filter strength
 // block we are currently looking at. Shift is used to position the
 // 1's we produce.
-// TODO(JBB) Need another function for different resolution color..
 static void build_masks(const loop_filter_info_n *const lfi_n,
                         const MODE_INFO *mi, const int shift_y,
                         const int shift_uv,
@@ -935,7 +906,6 @@
 
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
-// TODO(JBB): This function only works for yv12.
 void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
                     MODE_INFO **mi, const int mode_info_stride,
                     LOOP_FILTER_MASK *lfm) {
@@ -971,9 +941,6 @@
   vp9_zero(*lfm);
   assert(mip[0] != NULL);
 
-  // TODO(jimbankoski): Try moving most of the following code into decode
-  // loop and storing lfm in the mbmi structure so that we don't have to go
-  // through the recursive loop structure multiple times.
   switch (mip[0]->sb_type) {
     case BLOCK_64X64:
       build_masks(lfi_n, mip[0] , 0, 0, lfm);
@@ -1077,8 +1044,6 @@
       }
       break;
   }
-
-  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1086,13 +1051,13 @@
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                    const loop_filter_info_n *lfi_n,
+                                    const loop_filter_thresh *lfthr,
                                     const uint8_t *lfl) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
@@ -1120,13 +1085,13 @@
                                            unsigned int mask_8x8,
                                            unsigned int mask_4x4,
                                            unsigned int mask_4x4_int,
-                                           const loop_filter_info_n *lfi_n,
+                                           const loop_filter_thresh *lfthr,
                                            const uint8_t *lfl, int bd) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
@@ -1257,23 +1222,18 @@
                                      mask_8x8_c & border_mask,
                                      mask_4x4_c & border_mask,
                                      mask_4x4_int[r],
-                                     &cm->lf_info, &lfl[r << 3],
+                                     cm->lf_info.lfthr, &lfl[r << 3],
                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_vert(dst->buf, dst->stride,
                               mask_16x16_c & border_mask,
                               mask_8x8_c & border_mask,
                               mask_4x4_c & border_mask,
                               mask_4x4_int[r],
-                              &cm->lf_info, &lfl[r << 3]);
+                              cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_vert(dst->buf, dst->stride,
-                            mask_16x16_c & border_mask,
-                            mask_8x8_c & border_mask,
-                            mask_4x4_c & border_mask,
-                            mask_4x4_int[r],
-                            &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
@@ -1306,23 +1266,18 @@
                                       mask_8x8_r,
                                       mask_4x4_r,
                                       mask_4x4_int_r,
-                                      &cm->lf_info, &lfl[r << 3],
+                                      cm->lf_info.lfthr, &lfl[r << 3],
                                       (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfl[r << 3]);
+                               cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride,
-                             mask_16x16_r,
-                             mask_8x8_r,
-                             mask_4x4_r,
-                             mask_4x4_int_r,
-                             &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
   }
@@ -1344,27 +1299,29 @@
 
   // Vertical pass: do 2 rows at one time
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_vert_row2(
-          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr,
+                                          &lfm->lfl_y[r << 3],
+                                          (int)cm->bit_depth);
     } else {
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_vert_row2(
-        plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 16 * dst->stride;
     mask_16x16 >>= 16;
@@ -1397,19 +1354,18 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
-          (int)cm->bit_depth);
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int & 0xff,
+                                      cm->lf_info.lfthr, &lfm->lfl_y[r << 3],
+                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r << 3]);
+                               mask_4x4_r, mask_4x4_int & 0xff,
+                               cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                             &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1443,38 +1399,35 @@
       lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
     }
 
-    {
-      unsigned int mask_16x16_l = mask_16x16 & 0xff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        highbd_filter_selectively_vert_row2(
-            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1], (int)cm->bit_depth);
-      } else {
-        filter_selectively_vert_row2(
-            plane->subsampling_x, dst->buf, dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1]);
-      }
-#else
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfl_uv[r << 1]);
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                          (int)cm->bit_depth);
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 8;
-      mask_8x8 >>= 8;
-      mask_4x4 >>= 8;
-      mask_4x4_int >>= 8;
-    }
+    dst->buf += 16 * dst->stride;
+    mask_16x16 >>= 8;
+    mask_8x8 >>= 8;
+    mask_4x4 >>= 8;
+    mask_4x4_int >>= 8;
   }
 
   // Horizontal pass
@@ -1506,17 +1459,16 @@
     if (cm->use_highbitdepth) {
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
-                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfl_uv[r << 1], (int)cm->bit_depth);
+                                      mask_4x4_r, mask_4x4_int_r,
+                                      cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                               mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr,
                                &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1552,7 +1504,7 @@
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      // TODO(JBB): Make setup_mask work for non 420.
+      // TODO(jimbankoski): For 444 only need to do y mask.
       vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 
       vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
@@ -1592,6 +1544,8 @@
 }
 
 // Used by the encoder to build the loopfilter masks.
+// TODO(slavarnway): Do the encoder the same way the decoder does it and
+//                   build the masks in line as part of the encode process.
 void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
                           int partial_frame) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index b685d81..c04cc8f 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
@@ -587,32 +588,6 @@
   state->last_noise = a;
 }
 
-void vp9_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO *temp = cm->postproc_state.prev_mip;
@@ -726,8 +701,7 @@
         ppstate->last_noise != noise_level) {
       fillrd(ppstate, 63 - q, noise_level);
     }
-
-    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                         ppstate->whiteclamp, ppstate->bothclamp,
                         ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index b5751fc..8f90e70 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -28,9 +28,9 @@
 
   if (left_type == above_type)
     return left_type;
-  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+  else if (left_type == SWITCHABLE_FILTERS)
     return above_type;
-  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+  else if (above_type == SWITCHABLE_FILTERS)
     return left_type;
   else
     return SWITCHABLE_FILTERS;
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 74bc1d2..84718e9 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,19 +20,6 @@
 #include "vp9/common/vp9_reconintra.h"
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_inter_predictor(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int subpel_x,
-                                 const int subpel_y,
-                                 const struct scale_factors *sf,
-                                 int w, int h, int ref,
-                                 const InterpKernel *kernel,
-                                 int xs, int ys, int bd) {
-  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
-}
-
 void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
                                       uint8_t *dst, int dst_stride,
                                       const MV *src_mv,
@@ -50,8 +37,9 @@
 
   src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
 
-  high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                       sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+  highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+                         sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4,
+                         bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -222,9 +210,9 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
-                           xd->bd);
+      highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                             subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
+                             xd->bd);
     } else {
       inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
                       subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 7d90774..07745e3 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -34,14 +34,18 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_inter_predictor(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int subpel_x,
-                                 const int subpel_y,
-                                 const struct scale_factors *sf,
-                                 int w, int h, int ref,
-                                 const InterpKernel *kernel,
-                                 int xs, int ys, int bd);
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+                                          uint8_t *dst, int dst_stride,
+                                          const int subpel_x,
+                                          const int subpel_y,
+                                          const struct scale_factors *sf,
+                                          int w, int h, int ref,
+                                          const InterpKernel *kernel,
+                                          int xs, int ys, int bd) {
+  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+      src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index c4d91c8..4457858 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -142,6 +142,7 @@
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
+  // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1.
 
   // Get current frame pointer, width and height.
   if (plane == 0) {
@@ -177,7 +178,6 @@
           left_col[i] = ref[i * ref_stride - 1];
       }
     } else {
-      // TODO(Peter): this value should probably change for high bitdepth
       vpx_memset16(left_col, base + 1, bs);
     }
   }
@@ -239,7 +239,6 @@
           vpx_memset16(above_row + r, above_row[r - 1],
                        x0 + 2 * bs - frame_width);
         }
-        // TODO(Peter) this value should probably change for high bitdepth
         above_row[-1] = left_available ? above_ref[-1] : (base + 1);
       } else {
         /* faster path if the block does not need extension */
@@ -251,13 +250,11 @@
             memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
           else
             vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          // TODO(Peter): this value should probably change for high bitdepth
           above_row[-1] = left_available ? above_ref[-1] : (base + 1);
         }
       }
     } else {
       vpx_memset16(above_row, base - 1, bs * 2);
-      // TODO(Peter): this value should probably change for high bitdepth
       above_row[-1] = base - 1;
     }
   }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1cf636c..8461336 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -70,10 +70,6 @@
 specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
-add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise sse2/;
-$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
-
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
 
@@ -169,9 +165,6 @@
 
     add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
     specialize qw/vp9_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp9_highbd_plane_add_noise/;
   }
 
   #
@@ -252,7 +245,7 @@
   specialize qw/vp9_fht16x16 sse2/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
 } else {
   add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp9_fht4x4 sse2 msa/;
@@ -264,7 +257,7 @@
   specialize qw/vp9_fht16x16 sse2 msa/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
 }
 
 #
@@ -276,7 +269,7 @@
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
 
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad/;
+specialize qw/vp9_diamond_search_sad avx/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2 msa/;
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index c8ef618..7af6162 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -28,6 +28,7 @@
 void vp9_clearall_segfeatures(struct segmentation *seg) {
   vp9_zero(seg->feature_data);
   vp9_zero(seg->feature_mask);
+  seg->aq_av_offset = 0;
 }
 
 void vp9_enable_segfeature(struct segmentation *seg, int segment_id,
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 5b75d8d..7ea7c3d 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -47,6 +47,7 @@
 
   int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
   unsigned int feature_mask[MAX_SEGMENTS];
+  int aq_av_offset;
 };
 
 static INLINE int segfeature_active(const struct segmentation *seg,
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 8d312d0..1c77b57 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp9_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index ec8bfdb..4307628 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -624,68 +624,6 @@
 %undef flimit4
 
 
-;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_wmt) PRIVATE
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 2c2c0ba..d639129 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -525,8 +525,8 @@
   }
 
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+    highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
   } else {
     inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
                     subpel_y, sf, w, h, ref, kernel, xs, ys);
@@ -699,8 +699,8 @@
   }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+    highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
   } else {
     inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
                     subpel_y, sf, w, h, ref, kernel, xs, ys);
@@ -1315,11 +1315,16 @@
   BufferPool *const pool = cm->buffer_pool;
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     if (vpx_rb_read_bit(rb)) {
-      YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
-      width = buf->y_crop_width;
-      height = buf->y_crop_height;
-      found = 1;
-      break;
+      if (cm->frame_refs[i].idx != INVALID_IDX) {
+        YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+        width = buf->y_crop_width;
+        height = buf->y_crop_height;
+        found = 1;
+        break;
+      } else {
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode frame size");
+      }
     }
   }
 
@@ -1334,22 +1339,23 @@
   // has valid dimensions.
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
-                                                ref_frame->buf->y_crop_height,
-                                                width, height);
+    has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX &&
+                            valid_ref_frame_size(ref_frame->buf->y_crop_width,
+                                                 ref_frame->buf->y_crop_height,
+                                                 width, height));
   }
   if (!has_valid_ref_frame)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (!valid_ref_frame_img_fmt(
-            ref_frame->buf->bit_depth,
-            ref_frame->buf->subsampling_x,
-            ref_frame->buf->subsampling_y,
-            cm->bit_depth,
-            cm->subsampling_x,
-            cm->subsampling_y))
+    if (ref_frame->idx == INVALID_IDX ||
+        !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
+                                 ref_frame->buf->subsampling_x,
+                                 ref_frame->buf->subsampling_y,
+                                 cm->bit_depth,
+                                 cm->subsampling_x,
+                                 cm->subsampling_y))
       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8831de9..ffc6839 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -371,9 +371,9 @@
 
   if (left_type == above_type)
     return left_type;
-  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+  else if (left_type == SWITCHABLE_FILTERS)
     return above_type;
-  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+  else if (above_type == SWITCHABLE_FILTERS)
     return left_type;
   else
     return SWITCHABLE_FILTERS;
@@ -902,4 +902,10 @@
       frame_mvs += cm->mi_cols;
     }
   }
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        !is_inter_block(mi) && need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 }
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index dde609b..891613f 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -505,7 +505,7 @@
         uint32_t this_sz = 0;
 
         for (j = 0; j < mag; ++j)
-          this_sz |= (*x++) << (j * 8);
+          this_sz |= ((uint32_t)(*x++)) << (j * 8);
         sizes[i] = this_sz;
       }
       *count = frames;
diff --git a/vp9/encoder/vp9_aq_360.c b/vp9/encoder/vp9_aq_360.c
index f8c187c..7d411f6 100644
--- a/vp9/encoder/vp9_aq_360.c
+++ b/vp9/encoder/vp9_aq_360.c
@@ -13,6 +13,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
 
+#include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_variance.h"
 
 #include "vp9/common/vp9_seg_common.h"
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index b27ce6a..d8920fb 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -22,7 +22,6 @@
 #include "vp9/encoder/vp9_segmentation.h"
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
-  size_t consec_zero_mv_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
   if (cr == NULL)
     return NULL;
@@ -40,21 +39,12 @@
   }
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
-
-  consec_zero_mv_size = mi_rows * mi_cols * sizeof(*cr->consec_zero_mv);
-  cr->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
-  if (cr->consec_zero_mv == NULL) {
-    vp9_cyclic_refresh_free(cr);
-    return NULL;
-  }
-  memset(cr->consec_zero_mv, 0, consec_zero_mv_size);
   return cr;
 }
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   vpx_free(cr->map);
   vpx_free(cr->last_coded_q_map);
-  vpx_free(cr->consec_zero_mv);
   vpx_free(cr);
 }
 
@@ -244,7 +234,6 @@
                                              BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  MV mv = mi->mv[0].as_mv;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
   const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
@@ -268,15 +257,8 @@
             clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id],
                   0, MAXQ),
             cr->last_coded_q_map[map_offset]);
-      // Update the consecutive zero/low_mv count.
-      if (is_inter_block(mi) && (abs(mv.row) < 8 && abs(mv.col) < 8)) {
-        if (cr->consec_zero_mv[map_offset] < 255)
-          cr->consec_zero_mv[map_offset]++;
-      } else {
-        cr->consec_zero_mv[map_offset] = 0;
       }
     }
-  }
 }
 
 // Update the actual number of blocks that were applied the segment delta q.
@@ -410,13 +392,18 @@
   cr->target_num_seg_blocks = 0;
   if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
     consec_zero_mv_thresh = 100;
-   if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
-     consec_zero_mv_thresh = 80;
   }
   qindex_thresh =
       cpi->oxcf.content == VP9E_CONTENT_SCREEN
       ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
       : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
+  // More aggressive settings for noisy content.
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+    consec_zero_mv_thresh = 80;
+    qindex_thresh =
+        VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
+                              7 * cm->base_qindex >> 3);
+  }
   do {
     int sum_map = 0;
     // Get the mi_row/mi_col corresponding to superblock index i.
@@ -441,7 +428,7 @@
         if (cr->map[bl_index2] == 0) {
           count_tot++;
           if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
-              cr->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
+              cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
             sum_map++;
             count_sel++;
           }
@@ -480,6 +467,8 @@
     cr->percent_refresh = 5;
   cr->max_qdelta_perc = 50;
   cr->time_for_refresh = 0;
+  cr->motion_thresh = 32;
+  cr->rate_boost_fac = 15;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
   // periods of the refresh cycle, after a key frame.
   // Account for larger interval on base layer for temporal layers.
@@ -489,9 +478,11 @@
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
-  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
-    // Reduce the delta-qp if the estimated source noise is above threshold.
-    cr->rate_ratio_qdelta = 1.5;
+    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+      // Reduce the delta-qp if the estimated source noise is above threshold.
+      cr->rate_ratio_qdelta = 1.7;
+      cr->rate_boost_fac = 13;
+    }
   }
   // Adjust some parameters for low resolutions at low bitrates.
   if (cm->width <= 352 &&
@@ -499,9 +490,6 @@
       rc->avg_frame_bandwidth < 3400) {
     cr->motion_thresh = 4;
     cr->rate_boost_fac = 10;
-  } else {
-    cr->motion_thresh = 32;
-    cr->rate_boost_fac = 15;
   }
   if (cpi->svc.spatial_layer_id > 0) {
     cr->motion_thresh = 4;
@@ -544,8 +532,6 @@
     if (cm->frame_type == KEY_FRAME) {
       memset(cr->last_coded_q_map, MAXQ,
              cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
-      memset(cr->consec_zero_mv, 0,
-             cm->mi_rows * cm->mi_cols * sizeof(*cr->consec_zero_mv));
       cr->sb_index = 0;
     }
     return;
@@ -620,7 +606,6 @@
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
   memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
-  memset(cr->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols);
   cr->sb_index = 0;
   cpi->refresh_golden_frame = 1;
   cpi->refresh_alt_ref_frame = 1;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 095b928..35eea18 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -53,8 +53,6 @@
   signed char *map;
   // Map of the last q a block was coded at.
   uint8_t *last_coded_q_map;
-  // Count on how many consecutive times a block uses ZER0MV for encoding.
-  uint8_t *consec_zero_mv;
   // Thresholds applied to the projected rate/distortion of the coding block,
   // when deciding whether block should be refreshed.
   int64_t thresh_rate_sb;
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index d8f7d07..59ef5fa 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -167,7 +167,7 @@
                 vp9_64_zeros, 0, bw, bh, &sse, &avg);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     var = sse - (((int64_t)avg * avg) / (bw * bh));
-    return (256 * var) / (bw * bh);
+    return (unsigned int)(((uint64_t)256 * var) / (bw * bh));
   } else {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -185,7 +185,7 @@
                              x->plane[0].src.stride,
                              vp9_64_zeros, 0, &sse);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    return (256 * var) >> num_pels_log2_lookup[bs];
+    return (unsigned int)(((uint64_t)256 * var) >> num_pels_log2_lookup[bs]);
   }
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 5600ed4..73a2db0 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -545,8 +545,8 @@
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
-                      frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      frame_branch_ct[i][j][k][l][0], oldp, &newp, upd,
+                      stepsize);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -584,7 +584,7 @@
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      *oldp, &newp, upd, stepsize);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -622,7 +622,7 @@
                 if (t == PIVOT_NODE) {
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      *oldp, &newp, upd, stepsize);
                 } else {
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -891,7 +891,7 @@
     vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
 }
 
-static int get_refresh_mask(VP9_COMP *cpi) {
+int vp9_get_refresh_mask(VP9_COMP *cpi) {
   if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term we leave it in the GF slot and,
@@ -1107,11 +1107,11 @@
         write_bitdepth_colorspace_sampling(cm, wb);
       }
 
-      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
-      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
         vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index da6b414..f24d20f 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -18,6 +18,8 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 
+int vp9_get_refresh_mask(VP9_COMP *cpi);
+
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 147743e..069c335 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -145,6 +145,11 @@
 
   uint8_t sb_is_skin;
 
+  // Used to save the status of whether a block has a low variance in
+  // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
+  // 32x32, 9~24 for 16x16.
+  uint8_t variance_low[25];
+
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index dbca633..42d456e 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -21,12 +21,6 @@
 #include "vp9/encoder/vp9_denoiser.h"
 #include "vp9/encoder/vp9_encoder.h"
 
-/* The VP9 denoiser is similar to that of the VP8 denoiser. While
- * choosing the motion vectors / reference frames, the denoiser is run, and if
- * it did not modify the signal to much, the denoised block is copied to the
- * signal.
- */
-
 #ifdef OUTPUT_YUV_DENOISED
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
 #endif
@@ -49,16 +43,19 @@
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
                            int motion_magnitude) {
   if (motion_magnitude >
       noise_motion_thresh(bs, increase_denoising)) {
-    return 0;
+    if (increase_denoising)
+      return (1 << num_pels_log2_lookup[bs]) << 2;
+    else
+      return 0;
   } else {
-    return (1 << num_pels_log2_lookup[bs]) * 20;
+    return (1 << num_pels_log2_lookup[bs]) << 4;
   }
 }
 
@@ -215,12 +212,14 @@
   // Avoid denoising for small block (unless motion is small).
   // Small blocks are selected in variance partition (before encoding) and
   // will typically lie on moving areas.
-  if (motion_magnitude > 16 && bs <= BLOCK_8X8)
+  if (denoiser->denoising_level < kDenHigh &&
+      motion_magnitude > 16 && bs <= BLOCK_8X8)
     return COPY_BLOCK;
 
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
+      ctx->newmv_sse != UINT_MAX &&
       sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
     mi->ref_frame[0] = ctx->best_reference_frame;
     mi->mode = ctx->best_sse_inter_mode;
@@ -242,6 +241,9 @@
     ctx->best_sse_inter_mode = ZEROMV;
     ctx->best_sse_mv.as_int = 0;
     *zeromv_filter = 1;
+    if (denoiser->denoising_level > kDenMedium) {
+      motion_magnitude = 0;
+    }
   }
 
   if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -334,14 +336,13 @@
 
   if (cpi->use_skin_detection &&
       bs <= BLOCK_32X32 &&
-      denoiser->denoising_level >= kDenLow) {
+      denoiser->denoising_level < kDenHigh) {
     int motion_level = (motion_magnitude < 16) ? 0 : 1;
     // If motion for current block is small/zero, compute consec_zeromv for
     // skin detection (early exit in skin detection is done for large
     // consec_zeromv when current block has small/zero motion).
     consec_zeromv = 0;
     if (motion_level == 0) {
-      CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
       VP9_COMMON * const cm = &cpi->common;
       int j, i;
       // Loop through the 8x8 sub-blocks.
@@ -354,7 +355,7 @@
       for (i = 0; i < ymis; i++) {
         for (j = 0; j < xmis; j++) {
           int bl_index = block_index + i * cm->mi_cols + j;
-          consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index], consec_zeromv);
+          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv);
           // No need to keep checking 8x8 blocks if any of the sub-blocks
           // has small consec_zeromv (since threshold for no_skin based on
           // zero/small motion in skin detection is high, i.e, > 4).
@@ -376,8 +377,7 @@
                                      motion_level);
   }
   if (!is_skin &&
-      denoiser->denoising_level == kDenHigh &&
-      motion_magnitude < 16) {
+      denoiser->denoising_level == kDenHigh) {
     denoiser->increase_denoising = 1;
   } else {
     denoiser->increase_denoising = 0;
@@ -494,12 +494,12 @@
   ctx->zeromv_sse = UINT_MAX;
   ctx->newmv_sse = UINT_MAX;
   ctx->zeromv_lastref_sse = UINT_MAX;
+  ctx->best_sse_mv.as_int = 0;
 }
 
 void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
                                      PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx) {
-  // TODO(tkopp): Use both MVs if possible
   if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
     ctx->zeromv_sse = sse;
     ctx->best_zeromv_reference_frame = mi->ref_frame[0];
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 84189b4..9c86e5a 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -45,6 +45,18 @@
   VP9_DENOISER_LEVEL prev_denoising_level;
 } VP9_DENOISER;
 
+typedef struct {
+  int64_t zero_last_cost_orig;
+  int *ref_frame_cost;
+  int_mv (*frame_mv)[MAX_REF_FRAMES];
+  int reuse_inter_pred;
+  TX_SIZE best_tx_size;
+  PREDICTION_MODE best_mode;
+  MV_REFERENCE_FRAME best_ref_frame;
+  INTERP_FILTER best_pred_filter;
+  uint8_t best_mode_skip_txfm;
+} VP9_PICKMODE_CTX_DEN;
+
 struct VP9_COMP;
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1d8bfa9..e6a75d9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -134,7 +134,7 @@
                                0, &sse);
       break;
   }
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+  return ROUND_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -662,12 +662,79 @@
   }
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
+// Check if most of the superblock is skin content, and if so, force split to
+// 32x32, and set x->sb_is_skin for use in mode selection.
+static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
+                         int mi_row, int mi_col, int *force_split) {
+  VP9_COMMON * const cm = &cpi->common;
+  // Avoid checking superblocks on/near boundary and avoid low resolutions.
+  // Note superblock may still pick 64X64 if y_sad is very small
+  // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
+  if (!low_res && (mi_col >= 8 && mi_col + 8 < cm->mi_cols && mi_row >= 8 &&
+      mi_row + 8 < cm->mi_rows)) {
+    int num_16x16_skin = 0;
+    int num_16x16_nonskin = 0;
+    uint8_t *ysignal = x->plane[0].src.buf;
+    uint8_t *usignal = x->plane[1].src.buf;
+    uint8_t *vsignal = x->plane[2].src.buf;
+    int sp = x->plane[0].src.stride;
+    int spuv = x->plane[1].src.stride;
+    const int block_index = mi_row * cm->mi_cols + mi_col;
+    const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+    const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+    const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+    const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+    // Loop through the 16x16 sub-blocks.
+    int i, j;
+    for (i = 0; i < ymis; i+=2) {
+      for (j = 0; j < xmis; j+=2) {
+        int bl_index = block_index + i * cm->mi_cols + j;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                   cpi->consec_zero_mv[bl_index3])));
+        int is_skin = vp9_compute_skin_block(ysignal,
+                                             usignal,
+                                             vsignal,
+                                             sp,
+                                             spuv,
+                                             BLOCK_16X16,
+                                             consec_zeromv,
+                                             0);
+        num_16x16_skin += is_skin;
+        num_16x16_nonskin += (1 - is_skin);
+        if (num_16x16_nonskin > 3) {
+          // Exit loop if at least 4 of the 16x16 blocks are not skin.
+          i = ymis;
+          break;
+        }
+        ysignal += 16;
+        usignal += 8;
+        vsignal += 8;
+      }
+      ysignal += (sp << 4) - 64;
+      usignal += (spuv << 3) - 32;
+      vsignal += (spuv << 3) - 32;
+    }
+    if (num_16x16_skin > 12) {
+      *force_split = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+#endif
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static int choose_partitioning(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
+                               const TileInfo *const tile,
+                               MACROBLOCK *x,
+                               int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   int i, j, k, m;
@@ -680,6 +747,8 @@
   const uint8_t *d;
   int sp;
   int dp;
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
   int pixels_wide = 64, pixels_high = 64;
   int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
       cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
@@ -704,6 +773,8 @@
     }
   }
 
+  memset(x->variance_low, 0, sizeof(x->variance_low));
+
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0)
@@ -725,7 +796,7 @@
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
 
     const YV12_BUFFER_CONFIG *yv12_g = NULL;
-    unsigned int y_sad, y_sad_g;
+    unsigned int y_sad, y_sad_g, y_sad_thr;
     const BLOCK_SIZE bsize = BLOCK_32X32
         + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
 
@@ -758,84 +829,31 @@
     mi->interp_filter = BILINEAR;
 
     y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
-    if (y_sad_g < y_sad) {
+    // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
+    // are close if short_circuit_low_temp_var is on.
+    y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
+    if (y_sad_g < y_sad_thr) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
       mi->ref_frame[0] = GOLDEN_FRAME;
       mi->mv[0].as_int = 0;
       y_sad = y_sad_g;
+      ref_frame_partition = GOLDEN_FRAME;
     } else {
       x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+      ref_frame_partition = LAST_FRAME;
     }
 
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
-    // Check if most of the superblock is skin content, and if so, force split
-    // to 32x32, and set x->sb_is_skin for use in mode selection.
-    // Avoid checking superblocks on/near boundary and avoid low resolutions.
-    // Note superblock may still pick 64X64 if y_sad is very small
-    // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
     x->sb_is_skin = 0;
 #if !CONFIG_VP9_HIGHBITDEPTH
-    if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
-        mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
-      CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-      int bl_index1, bl_index2, bl_index3;
-      int num_16x16_skin = 0;
-      int num_16x16_nonskin = 0;
-      int is_skin = 0;
-      int consec_zeromv = 0;
-      uint8_t *ysignal = x->plane[0].src.buf;
-      uint8_t *usignal = x->plane[1].src.buf;
-      uint8_t *vsignal = x->plane[2].src.buf;
-      int spuv = x->plane[1].src.stride;
-      const int block_index = mi_row * cm->mi_cols + mi_col;
-      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
-      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
-      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
-      // Loop through the 16x16 sub-blocks.
-      int j, i;
-      for (i = 0; i < ymis; i+=2) {
-        for (j = 0; j < xmis; j+=2) {
-          int bl_index = block_index + i * cm->mi_cols + j;
-          bl_index1 = bl_index + 1;
-          bl_index2 = bl_index + cm->mi_cols;
-          bl_index3 = bl_index2 + 1;
-          consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
-                                 VPXMIN(cr->consec_zero_mv[bl_index1],
-                                 VPXMIN(cr->consec_zero_mv[bl_index2],
-                                 cr->consec_zero_mv[bl_index3])));
-          is_skin = vp9_compute_skin_block(ysignal,
-                                           usignal,
-                                           vsignal,
-                                           sp,
-                                           spuv,
-                                           BLOCK_16X16,
-                                           consec_zeromv,
-                                           0);
-          num_16x16_skin += is_skin;
-          num_16x16_nonskin += (1 - is_skin);
-          if (num_16x16_nonskin > 3) {
-            // Exit loop if at least 4 of the 16x16 blocks are not skin.
-            i = ymis;
-            j = xmis;
-          }
-          ysignal += 16;
-          usignal += 8;
-          vsignal += 8;
-        }
-        ysignal += (sp << 4) - 64;
-        usignal += (spuv << 3) - 32;
-        vsignal += (spuv << 3) - 32;
-      }
-      if (num_16x16_skin > 12) {
-        x->sb_is_skin = 1;
-        force_split[0] = 1;
-      }
-    }
+    if (cpi->use_skin_detection)
+      x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col,
+                                    &force_split[0]);
 #endif
+
     for (i = 1; i <= 2; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
@@ -1064,6 +1082,59 @@
       }
     }
   }
+
+  if (cpi->sf.short_circuit_low_temp_var) {
+    const int mv_thr = cm->width > 640 ? 8 : 4;
+    // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected
+    // and int_pro mv is small. If the temporal variance is small set the
+    // variance_low flag for the block. The variance threshold can be adjusted,
+    // the higher the more aggressive.
+    if (ref_frame_partition == LAST_FRAME &&
+        (cpi->sf.short_circuit_low_temp_var == 1 ||
+         (xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+          xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+          xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+          xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
+      if (xd->mi[0]->sb_type == BLOCK_64X64 &&
+          vt.part_variances.none.variance < (thresholds[0] >> 1)) {
+        x->variance_low[0] = 1;
+      } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+        for (j = 0; j < 2; j++) {
+          if (vt.part_variances.horz[j].variance < (thresholds[0] >> 2))
+            x->variance_low[j + 1] = 1;
+        }
+      } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+        for (j = 0; j < 2; j++) {
+          if (vt.part_variances.vert[j].variance < (thresholds[0] >> 2))
+            x->variance_low[j + 3] = 1;
+        }
+      } else {
+        for (i = 0; i < 4; i++) {
+          if (!force_split[i + 1]) {
+            // 32x32
+            if (vt.split[i].part_variances.none.variance <
+                (thresholds[1] >> 1))
+              x->variance_low[i + 5] = 1;
+          } else if (cpi->sf.short_circuit_low_temp_var == 2) {
+            int idx[4] = {0, 4, xd->mi_stride << 2, (xd->mi_stride << 2) + 4};
+            const int idx_str = cm->mi_stride * mi_row + mi_col + idx[i];
+            MODE_INFO **this_mi = cm->mi_grid_visible + idx_str;
+            // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+            // inside.
+            if ((*this_mi)->sb_type == BLOCK_16X16 ||
+                (*this_mi)->sb_type == BLOCK_32X16 ||
+                (*this_mi)->sb_type == BLOCK_16X32) {
+              for (j = 0; j < 4; j++) {
+                if (vt.split[i].split[j].part_variances.none.variance <
+                    (thresholds[2] >> 8))
+                  x->variance_low[(i << 2) + j + 9] = 1;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
   return 0;
 }
 
@@ -3752,8 +3823,14 @@
         break;
       case REFERENCE_PARTITION:
         set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
-        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-            xd->mi[0]->segment_id) {
+        // Use nonrd_pick_partition on scene-cut for VBR, or on qp-segment
+        // if cyclic_refresh is enabled.
+        // nonrd_pick_partition does not support 4x4 partition, so avoid it
+        // on key frame for now.
+        if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
+            cm->frame_type != KEY_FRAME) ||
+            (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+            xd->mi[0]->segment_id)) {
           // Use lower max_partition_size for low resoultions.
           if (cm->width <= 352 && cm->height <= 288)
             x->max_partition_size = BLOCK_32X32;
@@ -4026,7 +4103,6 @@
   rdc->m_search_count = 0;   // Count of motion search hits.
   rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
-
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
                  cm->uv_dc_delta_q == 0 &&
@@ -4138,6 +4214,31 @@
   }
 }
 
+static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+  struct segmentation *const seg = &cm->seg;
+
+  int mi_row, mi_col;
+  int sum_delta = 0;
+  int map_index = 0;
+  int qdelta_index;
+  int segment_id;
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+    MODE_INFO **mi_8x8 = mi_8x8_ptr;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++, mi_8x8++) {
+      segment_id = mi_8x8[0]->segment_id;
+      qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+      sum_delta += qdelta_index;
+      map_index++;
+    }
+    mi_8x8_ptr += cm->mi_stride;
+  }
+
+  return sum_delta / (cm->mi_rows * cm->mi_cols);
+}
+
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
@@ -4260,8 +4361,13 @@
     cm->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
   }
-}
 
+  // If segmentated AQ is enabled compute the average AQ weighting.
+  if (cm->seg.enabled && (cpi->oxcf.aq_mode != NO_AQ) &&
+      (cm->seg.update_map || cm->seg.update_data)) {
+    cm->seg.aq_av_offset = compute_frame_aq_offset(cpi);
+  }
+}
 static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
   const PREDICTION_MODE y_mode = mi->mode;
   const PREDICTION_MODE uv_mode = mi->uv_mode;
@@ -4281,6 +4387,32 @@
   ++counts->uv_mode[y_mode][uv_mode];
 }
 
+static void update_zeromv_cnt(VP9_COMP *const cpi,
+                              const MODE_INFO *const mi,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MV mv = mi->mv[0].as_mv;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  int x, y;
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_cols + x;
+      if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        if (abs(mv.row) < 8 && abs(mv.col) < 8) {
+          if (cpi->consec_zero_mv[map_offset] < 255)
+           cpi->consec_zero_mv[map_offset]++;
+        } else {
+          cpi->consec_zero_mv[map_offset] = 0;
+        }
+      }
+    }
+}
+
 static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -4311,9 +4443,15 @@
 
   if (!is_inter_block(mi)) {
     int plane;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
     mi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-      vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane);
+      vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane, 1);
     if (output_enabled)
       sum_intra_stats(td->counts, mi);
     vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip,
@@ -4361,5 +4499,7 @@
     ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
     if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
+    if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0)
+      update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize);
   }
 }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 689e8c0..fdf403e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -50,29 +50,22 @@
                      pd->dst.buf, pd->dst.stride);
 }
 
-#define RDTRUNC(RM, DM, R, D)                        \
-  (((1 << (VP9_PROB_COST_SHIFT - 1)) + (R) * (RM)) & \
-   ((1 << VP9_PROB_COST_SHIFT) - 1))
-
+// TODO(aconverse): Re-pack this structure.
 typedef struct vp9_token_state {
   int           rate;
-  int           error;
+  int64_t       error;
   int           next;
   int16_t       token;
-  int16_t       qc;
+  tran_low_t    qc;
+  tran_low_t    dqc;
 } vp9_token_state;
 
-// TODO(jimbankoski): experiment to find optimal RD numbers.
-static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {10, 6}, {8, 7}, };
 
 #define UPDATE_RD_COST()\
 {\
   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
 }
 
 // This function is a place holder for now but may ultimately need
@@ -103,16 +96,19 @@
   const int eob = p->eobs[block];
   const PLANE_TYPE type = get_plane_type(plane);
   const int default_eob = 16 << (tx_size << 1);
-  const int mul = 1 + (tx_size == TX_32X32);
-  const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const int shift = (tx_size == TX_32X32);
+  const int16_t* const dequant_ptr = pd->dequant;
+  const uint8_t* const band_translate = get_band_translate(tx_size);
   const scan_order *const so = get_scan(xd, tx_size, type, block);
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
+  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
   int next = eob, sz = 0;
-  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1;
+  const int64_t rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1;
+  int rate0, rate1;
+  int64_t error0, error1;
   int16_t t0, t1;
   EXTRABIT e0;
   int best, band, pt, i, final_eob;
@@ -126,9 +122,6 @@
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  if (!ref)
-    rdmult = (rdmult * 9) >> 4;
-
   /* Initialize the sentinel node of the trellis. */
   tokens[eob][0].rate = 0;
   tokens[eob][0].error = 0;
@@ -167,7 +160,7 @@
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
-      dx = mul * (dqcoeff[rc] - coeff[rc]);
+      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         dx >>= xd->bd - 8;
@@ -179,14 +172,15 @@
       tokens[i][0].next = next;
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
+      tokens[i][0].dqc = dqcoeff[rc];
       best_index[i][0] = best;
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
-          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+          (abs(x) * dequant_ptr[rc != 0] < (abs(coeff[rc]) << shift) +
                                                dequant_ptr[rc != 0]))
         shortcut = 1;
       else
@@ -195,6 +189,11 @@
       if (shortcut) {
         sz = -(x < 0);
         x -= 2 * sz + 1;
+      } else {
+        tokens[i][1] = tokens[i][0];
+        best_index[i][1] = best_index[i][0];
+        next = i;
+        continue;
       }
 
       /* Consider both possible successor states. */
@@ -245,6 +244,24 @@
       tokens[i][1].next = next;
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
+
+      if (x) {
+        tran_low_t offset = dq_step[rc != 0];
+        // The 32x32 transform coefficient uses half quantization step size.
+        // Account for the rounding difference in the dequantized coefficeint
+        // value when the quantization index is dropped from an even number
+        // to an odd number.
+        if (shift & x)
+          offset += (dequant_ptr[rc != 0] & 0x01);
+
+        if (sz == 0)
+          tokens[i][1].dqc = dqcoeff[rc] - offset;
+        else
+          tokens[i][1].dqc = dqcoeff[rc] + offset;
+      } else {
+        tokens[i][1].dqc = 0;
+      }
+
       best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
@@ -284,18 +301,13 @@
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = -1;
-  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
-  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+
   for (i = next; i < eob; i = next) {
     const int x = tokens[i][best].qc;
     const int rc = scan[i];
-    if (x) {
-      final_eob = i;
-    }
-
+    if (x) final_eob = i;
     qcoeff[rc] = x;
-    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
-
+    dqcoeff[rc] = tokens[i][best].dqc;
     next = tokens[i][best].next;
     best = best_index[i][best];
   }
@@ -785,10 +797,19 @@
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   int i, j;
+  struct optimize_ctx *const ctx = args->ctx;
+  ENTROPY_CONTEXT *a = NULL;
+  ENTROPY_CONTEXT *l = NULL;
+  int entropy_ctx = 0;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   dst = &pd->dst.buf[4 * (j * dst_stride + i)];
   src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  if (args->ctx != NULL) {
+    a = &ctx->ta[plane][i];
+    l = &ctx->tl[plane][j];
+    entropy_ctx = combine_entropy_contexts(*a, *l);
+  }
 
   if (tx_size == TX_4X4) {
     tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
@@ -907,6 +928,9 @@
                              pd->dequant, eob, scan_order->scan,
                              scan_order->iscan);
       }
+      if (args->ctx != NULL && !x->skip_recode) {
+       *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob)
         vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
@@ -920,6 +944,9 @@
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
+      if (args->ctx != NULL && !x->skip_recode) {
+        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob)
         vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
@@ -933,6 +960,9 @@
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
+      if (args->ctx != NULL && !x->skip_recode) {
+        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob)
         vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
@@ -949,7 +979,9 @@
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
-
+      if (args->ctx != NULL && !x->skip_recode) {
+        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -968,10 +1000,21 @@
     *(args->skip) = 0;
 }
 
-void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+                                  int enable_optimize_b) {
   const MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
   struct encode_b_args arg = {x, NULL, &xd->mi[0]->skip};
 
+  if (enable_optimize_b && x->optimize &&
+      (!x->skip_recode || !x->skip_optimize)) {
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) :
+        xd->mi[0]->tx_size;
+    vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+    arg.ctx = &ctx;
+  }
+
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
                                          vp9_encode_block_intra, &arg);
 }
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 97df8a6..25b0b23 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -37,7 +37,8 @@
 void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, void *arg);
 
-void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+                                  int enable_optimize_b);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 845f5aa..fde1cb9 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -86,6 +86,25 @@
 FILE *keyfile;
 #endif
 
+static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
+  {LEVEL_1,   829440,      36864,    200,    400,   2, 1,  4,  8},
+  {LEVEL_1_1, 2764800,     73728,    800,    1000,  2, 1,  4,  8},
+  {LEVEL_2,   4608000,     122880,   1800,   1500,  2, 1,  4,  8},
+  {LEVEL_2_1, 9216000,     245760,   3600,   2800,  2, 2,  4,  8},
+  {LEVEL_3,   20736000,    552960,   7200,   6000,  2, 4,  4,  8},
+  {LEVEL_3_1, 36864000,    983040,   12000,  10000, 2, 4,  4,  8},
+  {LEVEL_4,   83558400,    2228224,  18000,  16000, 4, 4,  4,  8},
+  {LEVEL_4_1, 160432128,   2228224,  30000,  18000, 4, 4,  5,  6},
+  {LEVEL_5,   311951360,   8912896,  60000,  36000, 6, 8,  6,  4},
+  {LEVEL_5_1, 588251136,   8912896,  120000, 46000, 8, 8,  10, 4},
+  // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
+  // they are finalized (currently TBD).
+  {LEVEL_5_2, 1176502272,  8912896,  180000, 0,     8, 8,  10, 4},
+  {LEVEL_6,   1176502272,  35651584, 180000, 0,     8, 16, 10, 4},
+  {LEVEL_6_1, 2353004544u, 35651584, 240000, 0,     8, 16, 10, 4},
+  {LEVEL_6_2, 4706009088u, 35651584, 480000, 0,     8, 16, 10, 4},
+};
+
 static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -159,6 +178,39 @@
   }
 }
 
+static void init_level_info(Vp9LevelInfo *level_info) {
+  Vp9LevelStats *const level_stats = &level_info->level_stats;
+  Vp9LevelSpec *const level_spec = &level_info->level_spec;
+
+  memset(level_stats, 0, sizeof(*level_stats));
+  memset(level_spec, 0, sizeof(*level_spec));
+  level_spec->level = LEVEL_UNKNOWN;
+  level_spec->min_altref_distance = INT_MAX;
+}
+
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec * const level_spec) {
+  int i;
+  const Vp9LevelSpec *this_level;
+
+  vpx_clear_system_state();
+
+  for (i = 0; i < VP9_LEVELS; ++i) {
+    this_level = &vp9_level_defs[i];
+    if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) >
+        (double)this_level->max_luma_sample_rate ||
+        level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+        level_spec->average_bitrate > this_level->average_bitrate ||
+        level_spec->max_cpb_size > this_level->max_cpb_size ||
+        level_spec->compression_ratio < this_level->compression_ratio ||
+        level_spec->max_col_tiles > this_level->max_col_tiles ||
+        level_spec->min_altref_distance < this_level->min_altref_distance ||
+        level_spec->max_ref_frame_buffers > this_level->max_ref_frame_buffers)
+      continue;
+    break;
+  }
+  return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
+}
+
 int vp9_set_active_map(VP9_COMP* cpi,
                        unsigned char* new_map_16x16,
                        int rows,
@@ -375,6 +427,9 @@
   vpx_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+  vpx_free(cpi->consec_zero_mv);
+  cpi->consec_zero_mv = NULL;
+
   vp9_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
   vp9_free_postproc_buffers(cm);
@@ -771,7 +826,6 @@
 
   cpi->oxcf = *oxcf;
   cpi->framerate = oxcf->init_framerate;
-
   cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -780,6 +834,9 @@
   cm->color_space = oxcf->color_space;
   cm->color_range = oxcf->color_range;
 
+  cpi->target_level = oxcf->target_level;
+  cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+
   cm->width = oxcf->width;
   cm->height = oxcf->height;
   alloc_compressor_data(cpi);
@@ -1470,6 +1527,9 @@
   cm->color_space = oxcf->color_space;
   cm->color_range = oxcf->color_range;
 
+  cpi->target_level = oxcf->target_level;
+  cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+
   if (cm->profile <= PROFILE_1)
     assert(cm->bit_depth == VPX_BITS_8);
   else
@@ -1549,9 +1609,12 @@
 
   update_frame_size(cpi);
 
-  if ((last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) &&
-      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-    vp9_cyclic_refresh_reset_resize(cpi);
+  if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
+    memset(cpi->consec_zero_mv, 0,
+               cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_reset_resize(cpi);
+  }
 
   if ((cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.rc_mode == VPX_CBR) ||
@@ -1649,7 +1712,6 @@
   } while (++i <= MV_MAX);
 }
 
-
 VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
@@ -1686,9 +1748,6 @@
   cpi->use_skin_detection = 0;
   cpi->common.buffer_pool = pool;
 
-  cpi->rc.high_source_sad = 0;
-  cpi->rc.count_last_scene_change = 0;
-
   init_config(cpi, oxcf);
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
@@ -1698,6 +1757,10 @@
 
   realloc_segmentation_maps(cpi);
 
+  CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols,
+                             sizeof(*cpi->consec_zero_mv)));
+
   CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
   CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
@@ -1737,6 +1800,9 @@
   cpi->multi_arf_last_grp_enabled = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+
+  init_level_info(&cpi->level_info);
+
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_blockiness = 1;
   cpi->b_calculate_consistency = 1;
@@ -2009,6 +2075,8 @@
       const double dr =
           (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
       const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
       if (cpi->b_calculate_psnr) {
         const double total_psnr =
@@ -2053,8 +2121,9 @@
           SNPRINT2(results, "\t%7.3f", consistency);
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
-        fprintf(f, "%s\t    Time\n", headings);
-        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
+        fprintf(f, "%s\t    Time  Rc-Err Abs Err\n", headings);
+        fprintf(f, "%s\t%8.0f %7.2f %7.2f\n", results,
+                total_encode_time, rate_err, fabs(rate_err));
       }
 
       fclose(f);
@@ -2515,7 +2584,7 @@
   } else if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
-    // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+    // vp9_get_refresh_mask() we left it in the GF slot and, if
     // we're updating the GF with the current decoded frame, we save it to the
     // ARF slot instead.
     // We now have to update the ARF with the current frame and swap gld_fb_idx
@@ -2859,7 +2928,7 @@
        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
         "%6d %6d %5d %5d %5d "
         "%10"PRId64" %10.3lf"
-        "%10lf %8u %10"PRId64" %10d %10d %10d %10d\n",
+        "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
         cpi->td.rd_counts.m_search_count,
@@ -2893,7 +2962,8 @@
         cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
         cpi->twopass.kf_zeromotion_pct,
         cpi->twopass.fr_content_type,
-        cm->lf.filter_level);
+        cm->lf.filter_level,
+        cm->seg.aq_av_offset);
   }
   fclose(f);
 
@@ -3162,6 +3232,12 @@
                                              cpi->unscaled_last_source,
                                              &cpi->scaled_last_source,
                                              (cpi->oxcf.pass == 0));
+
+  if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) {
+    memset(cpi->consec_zero_mv, 0,
+           cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
+  }
+
   vp9_update_noise_estimate(cpi);
 
   if (cpi->oxcf.pass == 0 &&
@@ -4130,6 +4206,124 @@
 }
 #endif  // CONFIG_INTERNAL_STATS
 
+static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  Vp9LevelInfo *const level_info = &cpi->level_info;
+  Vp9LevelSpec *const level_spec = &level_info->level_spec;
+  Vp9LevelStats *const level_stats = &level_info->level_stats;
+  int i, idx;
+  uint64_t luma_samples, dur_end;
+  const uint32_t luma_pic_size = cm->width * cm->height;
+  double cpb_data_size;
+
+  vpx_clear_system_state();
+
+  // update level_stats
+  level_stats->total_compressed_size += *size;
+  if (cm->show_frame) {
+    level_stats->total_uncompressed_size +=
+        luma_pic_size +
+        2 * (luma_pic_size >> (cm->subsampling_x + cm->subsampling_y));
+    level_stats->time_encoded =
+        (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+        (double)TICKS_PER_SEC;
+  }
+
+  if (arf_src_index > 0) {
+    if (!level_stats->seen_first_altref) {
+      level_stats->seen_first_altref = 1;
+    } else if (level_stats->frames_since_last_altref <
+             level_spec->min_altref_distance) {
+      level_spec->min_altref_distance = level_stats->frames_since_last_altref;
+    }
+    level_stats->frames_since_last_altref = 0;
+  } else {
+    ++level_stats->frames_since_last_altref;
+  }
+
+  if (level_stats->frame_window_buffer.len < FRAME_WINDOW_SIZE - 1) {
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len++) % FRAME_WINDOW_SIZE;
+  } else {
+    idx = level_stats->frame_window_buffer.start;
+    level_stats->frame_window_buffer.start = (idx + 1) % FRAME_WINDOW_SIZE;
+  }
+  level_stats->frame_window_buffer.buf[idx].ts = cpi->last_time_stamp_seen;
+  level_stats->frame_window_buffer.buf[idx].size = (uint32_t)(*size);
+  level_stats->frame_window_buffer.buf[idx].luma_samples = luma_pic_size;
+
+  if (cm->frame_type == KEY_FRAME) {
+    level_stats->ref_refresh_map = 0;
+  } else {
+    int count = 0;
+    level_stats->ref_refresh_map |= vp9_get_refresh_mask(cpi);
+    // Also need to consider the case where the encoder refers to a buffer
+    // that has been implicitly refreshed after encoding a keyframe.
+    if (!cm->intra_only) {
+      level_stats->ref_refresh_map |= (1 << cpi->lst_fb_idx);
+      level_stats->ref_refresh_map |= (1 << cpi->gld_fb_idx);
+      level_stats->ref_refresh_map |= (1 << cpi->alt_fb_idx);
+    }
+    for (i = 0; i < REF_FRAMES; ++i) {
+      count += (level_stats->ref_refresh_map >> i) & 1;
+    }
+    if (count > level_spec->max_ref_frame_buffers) {
+      level_spec->max_ref_frame_buffers = count;
+    }
+  }
+
+  // update average_bitrate
+  level_spec->average_bitrate =
+      (double)level_stats->total_compressed_size / 125.0 /
+      level_stats->time_encoded;
+
+  // update max_luma_sample_rate
+  luma_samples = 0;
+  for (i = 0; i < level_stats->frame_window_buffer.len; ++i) {
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len - 1 - i) % FRAME_WINDOW_SIZE;
+    if (i == 0) {
+      dur_end = level_stats->frame_window_buffer.buf[idx].ts;
+    }
+    if (dur_end - level_stats->frame_window_buffer.buf[idx].ts >=
+        TICKS_PER_SEC) {
+      break;
+    }
+    luma_samples += level_stats->frame_window_buffer.buf[idx].luma_samples;
+  }
+  if (luma_samples > level_spec->max_luma_sample_rate) {
+    level_spec->max_luma_sample_rate = luma_samples;
+  }
+
+  // update max_cpb_size
+  cpb_data_size = 0;
+  for (i = 0; i < CPB_WINDOW_SIZE; ++i) {
+    if (i >= level_stats->frame_window_buffer.len) break;
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len - 1 - i) % FRAME_WINDOW_SIZE;
+    cpb_data_size += level_stats->frame_window_buffer.buf[idx].size;
+  }
+  cpb_data_size = cpb_data_size / 125.0;
+  if (cpb_data_size > level_spec->max_cpb_size) {
+    level_spec->max_cpb_size = cpb_data_size;
+  }
+
+  // update max_luma_picture_size
+  if (luma_pic_size > level_spec->max_luma_picture_size) {
+    level_spec->max_luma_picture_size = luma_pic_size;
+  }
+
+  // update compression_ratio
+  level_spec->compression_ratio =
+      (double)level_stats->total_uncompressed_size * cm->bit_depth /
+      level_stats->total_compressed_size / 8.0;
+
+  // update max_col_tiles
+  if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) {
+    level_spec->max_col_tiles = (1 << cm->log2_tile_cols);
+  }
+}
+
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
@@ -4400,6 +4594,9 @@
   if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
 
+  if (cpi->keep_level_stats && oxcf->pass != 1)
+    update_level_info(cpi, size, arf_src_index);
+
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 02d223a..9a8b15a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -20,6 +20,7 @@
 #include "vpx_dsp/ssim.h"
 #endif
 #include "vpx_dsp/variance.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -51,6 +52,9 @@
 extern "C" {
 #endif
 
+// vp9 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000
+
 typedef struct {
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
@@ -129,7 +133,7 @@
   int height;  // height of data passed to the compressor
   unsigned int input_bit_depth;  // Input bit depth.
   double init_framerate;  // set to passed in framerate
-  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
+  int64_t target_bandwidth;  // bandwidth to be used in bits per second
 
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;  // sharpening output: recommendation 0:
@@ -227,6 +231,8 @@
 
   int max_threads;
 
+  int target_level;
+
   vpx_fixed_buf_t two_pass_stats_in;
   struct vpx_codec_pkt_list *output_pkt_list;
 
@@ -295,6 +301,69 @@
   double worst;
 } ImageStat;
 
+#define CPB_WINDOW_SIZE 4
+#define FRAME_WINDOW_SIZE 128
+#define SAMPLE_RATE_GRACE_P 0.015
+#define VP9_LEVELS 14
+
+typedef enum {
+  LEVEL_UNKNOWN = 0,
+  LEVEL_1 = 10,
+  LEVEL_1_1 = 11,
+  LEVEL_2 = 20,
+  LEVEL_2_1 = 21,
+  LEVEL_3 = 30,
+  LEVEL_3_1 = 31,
+  LEVEL_4 = 40,
+  LEVEL_4_1 = 41,
+  LEVEL_5 = 50,
+  LEVEL_5_1 = 51,
+  LEVEL_5_2 = 52,
+  LEVEL_6 = 60,
+  LEVEL_6_1 = 61,
+  LEVEL_6_2 = 62,
+  LEVEL_MAX = 255
+} VP9_LEVEL;
+
+typedef struct {
+  VP9_LEVEL level;
+  uint64_t max_luma_sample_rate;
+  uint32_t max_luma_picture_size;
+  double average_bitrate;  // in kilobits per second
+  double max_cpb_size;  // in kilobits
+  double compression_ratio;
+  uint8_t max_col_tiles;
+  uint32_t min_altref_distance;
+  uint8_t max_ref_frame_buffers;
+} Vp9LevelSpec;
+
+typedef struct {
+  int64_t ts;  // timestamp
+  uint32_t luma_samples;
+  uint32_t size;  // in bytes
+} FrameRecord;
+
+typedef struct {
+  FrameRecord buf[FRAME_WINDOW_SIZE];
+  uint8_t start;
+  uint8_t len;
+} FrameWindowBuffer;
+
+typedef struct {
+  uint8_t seen_first_altref;
+  uint32_t frames_since_last_altref;
+  uint64_t total_compressed_size;
+  uint64_t total_uncompressed_size;
+  double time_encoded;  // in seconds
+  FrameWindowBuffer frame_window_buffer;
+  int ref_refresh_map;
+} Vp9LevelStats;
+
+typedef struct {
+  Vp9LevelStats level_stats;
+  Vp9LevelSpec level_spec;
+} Vp9LevelInfo;
+
 typedef struct VP9_COMP {
   QUANTS quants;
   ThreadData td;
@@ -494,8 +563,13 @@
 
   int use_skin_detection;
 
+  int target_level;
+
   NOISE_ESTIMATE noise_estimate;
 
+  // Count on how many consecutive times a block uses small/zeromv for encoding.
+  uint8_t *consec_zero_mv;
+
   // VAR_BASED_PARTITION thresholds
   // 0 - threshold_64x64; 1 - threshold_32x32;
   // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
@@ -509,6 +583,9 @@
   VPxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   VP9LfSync lf_row_sync;
+
+  int keep_level_stats;
+  Vp9LevelInfo level_info;
 } VP9_COMP;
 
 void vp9_initialize_enc(void);
@@ -658,6 +735,8 @@
   return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
 }
 
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
+
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3e34c01..f6e61b6 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -41,11 +41,8 @@
 #define OUTPUT_FPF          0
 #define ARF_STATS_OUTPUT    0
 
-#define GROUP_ADAPTIVE_MAXQ 1
-
 #define BOOST_BREAKOUT      12.5
 #define BOOST_FACTOR        12.5
-#define ERR_DIVISOR         128.0
 #define FACTOR_PT_LOW       0.70
 #define FACTOR_PT_HIGH      0.90
 #define FIRST_PASS_Q        10.0
@@ -741,7 +738,7 @@
       xd->mi[0]->mode = DC_PRED;
       xd->mi[0]->tx_size = use_dc_pred ?
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-      vp9_encode_intra_block_plane(x, bsize, 0);
+      vp9_encode_intra_block_plane(x, bsize, 0, 0);
       this_error = vpx_get_mb_ss(x->plane[0].src_diff);
 
       // Keep a record of blocks that have almost no intra error residual
@@ -1124,7 +1121,8 @@
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
     fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
-    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+    // Currently set to 0 as most issues relate to letter boxing.
+    fps.inactive_zone_cols = (double)0;
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -1150,10 +1148,9 @@
       fps.pcnt_motion = 0.0;
     }
 
-    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
-    // something less than the full time between subsequent values of
-    // cpi->source_time_stamp.
-    fps.duration = (double)(source->ts_end - source->ts_start);
+    // Dont allow a value of 0 for duration.
+    // (Section duration is also defaulted to minimum of 1.0).
+    fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start));
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
@@ -1241,18 +1238,15 @@
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-// Larger image formats are expected to be a little harder to code relatively
-// given the same prediction error score. This in part at least relates to the
-// increased size and hence coding cost of motion vectors.
-#define EDIV_SIZE_FACTOR 800
-
-static int get_twopass_worst_quality(const VP9_COMP *cpi,
+#define ERR_DIVISOR         115.0
+static int get_twopass_worst_quality(VP9_COMP *cpi,
                                      const double section_err,
                                      double inactive_zone,
-                                     int section_target_bandwidth,
-                                     double group_weight_factor) {
+                                     int section_target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+
   // Clamp the target rate to VBR min / max limts.
   const int target_rate =
       vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
@@ -1267,29 +1261,36 @@
     const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+    double last_group_rate_err;
     const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
-
     int q;
     int is_svc_upper_layer = 0;
 
     if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
       is_svc_upper_layer = 1;
 
+    // based on recent history adjust expectations of bits per macroblock.
+    last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits /
+        DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+    last_group_rate_err =
+        VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+    twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+    twopass->bpm_factor =
+        VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
 
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
           calc_correction_factor(av_err_per_mb,
-                                 ERR_DIVISOR - ediv_size_correction,
+                                 ERR_DIVISOR,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
                                  FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
                                  cpi->common.bit_depth);
       const int bits_per_mb =
         vp9_rc_bits_per_mb(INTER_FRAME, q,
-                           factor * speed_term * group_weight_factor,
+                           factor * speed_term * cpi->twopass.bpm_factor,
                            cpi->common.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb)
         break;
@@ -1340,6 +1341,7 @@
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
                               (svc->number_temporal_layers > 1);
+  RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = is_two_pass_svc ?
       &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
   double frame_rate;
@@ -1396,15 +1398,21 @@
   }
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
-
-  cpi->rc.rate_error_estimate = 0;
+  rc->vbr_bits_off_target = 0;
+  rc->vbr_bits_off_target_fast = 0;
+  rc->rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
   twopass->last_kfgroup_zeromotion_pct = 100;
 
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initiallize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+
   if (oxcf->resize_mode != RESIZE_NONE) {
     init_subsampling(cpi);
   }
@@ -1929,9 +1937,7 @@
   double boost_score = 0.0;
   double old_boost_score = 0.0;
   double gf_group_err = 0.0;
-#if GROUP_ADAPTIVE_MAXQ
   double gf_group_raw_error = 0.0;
-#endif
   double gf_group_skip_pct = 0.0;
   double gf_group_inactive_zone_rows = 0.0;
   double gf_first_frame_err = 0.0;
@@ -1981,9 +1987,7 @@
   // the error score / cost of this frame has already been accounted for.
   if (arf_active_or_kf) {
     gf_group_err -= gf_first_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error -= this_frame->coded_error;
-#endif
     gf_group_skip_pct -= this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
@@ -2037,9 +2041,7 @@
     // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
     gf_group_err += mod_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error += this_frame->coded_error;
-#endif
     gf_group_skip_pct += this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
 
@@ -2104,8 +2106,6 @@
     old_boost_score = boost_score;
   }
 
-  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
-
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
@@ -2141,9 +2141,7 @@
       if (EOF == input_stats(twopass, this_frame))
         break;
       gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-#if GROUP_ADAPTIVE_MAXQ
       gf_group_raw_error += this_frame->coded_error;
-#endif
       gf_group_skip_pct += this_frame->intra_skip_pct;
       gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
     }
@@ -2158,7 +2156,6 @@
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
-#if GROUP_ADAPTIVE_MAXQ
   // Calculate an estimate of the maxq needed for the group.
   // We are more agressive about correcting for sections
   // where there could be significant overshoot than for easier
@@ -2173,26 +2170,13 @@
     const double group_av_inactive_zone =
       ((gf_group_inactive_zone_rows * 2) /
        (rc->baseline_gf_interval * (double)cm->mb_rows));
-
-    int tmp_q;
-    // rc factor is a weight factor that corrects for local rate control drift.
-    double rc_factor = 1.0;
-    if (rc->rate_error_estimate > 0) {
-      rc_factor = VPXMAX(RC_FACTOR_MIN,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    } else {
-      rc_factor = VPXMIN(RC_FACTOR_MAX,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    }
-    tmp_q =
-      get_twopass_worst_quality(cpi, group_av_err,
-                                (group_av_skip_pct + group_av_inactive_zone),
-                                vbr_group_bits_per_frame,
-                                twopass->kfgroup_inter_fraction * rc_factor);
+    int tmp_q =
+        get_twopass_worst_quality(cpi, group_av_err,
+                                  (group_av_skip_pct + group_av_inactive_zone),
+                                  vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
+        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
   }
-#endif
 
   // Calculate the extra bits to be used for boosted frame(s)
   gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
@@ -2232,6 +2216,10 @@
     // Default to starting GF groups at normal frame size.
     cpi->rc.next_frame_size_selector = UNSCALED;
   }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 0;
+  twopass->rolling_arf_group_actual_bits = 0;
 }
 
 // Threshold for use of the lagging second reference frame. High second ref
@@ -2569,16 +2557,6 @@
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
                                   rc->kf_boost, twopass->kf_group_bits);
 
-  // Work out the fraction of the kf group bits reserved for the inter frames
-  // within the group after discounting the bits for the kf itself.
-  if (twopass->kf_group_bits) {
-    twopass->kfgroup_inter_fraction =
-      (double)(twopass->kf_group_bits - kf_bits) /
-      (double)twopass->kf_group_bits;
-  } else {
-    twopass->kfgroup_inter_fraction = 1.0;
-  }
-
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
@@ -2672,21 +2650,12 @@
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
-  int frames_left;
   FIRSTPASS_STATS this_frame;
 
   int target_rate;
   LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
         &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
-  if (lc != NULL) {
-    frames_left = (int)(twopass->total_stats.count -
-                  lc->current_video_frame_in_layer);
-  } else {
-    frames_left = (int)(twopass->total_stats.count -
-                  cm->current_video_frame);
-  }
-
   if (!twopass->stats_in)
     return;
 
@@ -2728,6 +2697,9 @@
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
              (lc != NULL && lc->current_video_frame_in_layer == 0)) {
+    const int frames_left = (int)(twopass->total_stats.count -
+        ((lc != NULL) ? lc->current_video_frame_in_layer
+                      : cm->current_video_frame));
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2739,10 +2711,10 @@
     const double section_inactive_zone =
       (twopass->total_left_stats.inactive_zone_rows * 2) /
       ((double)cm->mb_rows * section_length);
-    const int tmp_q =
-      get_twopass_worst_quality(cpi, section_error,
-                                section_intra_skip + section_inactive_zone,
-                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+    int tmp_q;
+
+    tmp_q = get_twopass_worst_quality(cpi, section_error,
+        section_intra_skip + section_inactive_zone, section_target_bandwidth);
 
     twopass->active_worst_quality = tmp_q;
     twopass->baseline_active_worst_quality = tmp_q;
@@ -2849,6 +2821,7 @@
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
+  VP9_COMMON *const cm = &cpi->common;
   const int bits_used = rc->base_frame_target;
 
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
@@ -2859,6 +2832,10 @@
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
   twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
 
+  // Target vs actual bits for this arf group.
+  twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+  twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
   // Calculate the pct rc error.
   if (rc->total_actual_bits) {
     rc->rate_error_estimate =
@@ -2880,12 +2857,27 @@
 
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
       !cpi->rc.is_src_frame_alt_ref) {
     const int maxq_adj_limit =
       rc->worst_quality - twopass->active_worst_quality;
     const int minq_adj_limit =
         (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+    int aq_extend_min = 0;
+    int aq_extend_max = 0;
+
+    // Extend min or Max Q range to account for imbalance from the base
+    // value when using AQ.
+    if (cpi->oxcf.aq_mode != NO_AQ) {
+      if (cm->seg.aq_av_offset < 0) {
+        // The balance of the AQ map tends towarda lowering the average Q.
+        aq_extend_min = 0;
+        aq_extend_max = VPXMIN(maxq_adj_limit, -cm->seg.aq_av_offset);
+      } else {
+        // The balance of the AQ map tends towards raising the average Q.
+        aq_extend_min = VPXMIN(minq_adj_limit, cm->seg.aq_av_offset);
+        aq_extend_max = 0;
+      }
+    }
 
     // Undershoot.
     if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
@@ -2910,8 +2902,10 @@
         --twopass->extend_maxq;
     }
 
-    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
-    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+    twopass->extend_minq =
+        clamp(twopass->extend_minq, aq_extend_min, minq_adj_limit);
+    twopass->extend_maxq =
+        clamp(twopass->extend_maxq, aq_extend_max, maxq_adj_limit);
 
     // If there is a big and undexpected undershoot then feed the extra
     // bits back in quickly. One situation where this may happen is if a
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 7eb44fa..7607288 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,8 +39,6 @@
 } FIRSTPASS_MB_STATS;
 #endif
 
-#define VLOW_MOTION_THRESHOLD 950
-
 typedef struct {
   double frame;
   double weight;
@@ -124,14 +122,13 @@
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
 
-  // The fraction for a kf groups total bits allocated to the inter frames
-  double kfgroup_inter_fraction;
+  double bpm_factor;
+  int rolling_arf_group_target_bits;
+  int rolling_arf_group_actual_bits;
 
   int sr_update_lag;
-
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
-  int gf_zeromotion_pct;
   int active_worst_quality;
   int baseline_active_worst_quality;
   int extend_minq;
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index def9b8c..441280c 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -20,8 +20,8 @@
 
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
   assert(index < ctx->max_sz);
@@ -35,7 +35,7 @@
 void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
   if (ctx) {
     if (ctx->buf) {
-      unsigned int i;
+      int i;
 
       for (i = 0; i < ctx->max_sz; i++)
         vpx_free_frame_buffer(&ctx->buf[i].img);
@@ -221,9 +221,9 @@
 
   if (index >= 0) {
     // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
       index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
         index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 1382038..db0fd1c 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -36,10 +36,10 @@
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
   struct lookahead_entry *buf; /* Buffer list */
 };
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 3ae5b21..14a0b16 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -59,8 +59,8 @@
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
-    int distortion;
-    unsigned int sse;
+    uint32_t distortion;
+    uint32_t sse;
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4669145..e747277 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -144,16 +144,6 @@
   cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
 /* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
  * from the same math as in mv_err_cost(). */
 #define MVC(r, c)                                              \
@@ -172,6 +162,33 @@
   return &buf[(r >> 3) * stride + (c >> 3)];
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    int64_t tmpmse;                                                    \
+    if (second_pred == NULL) {                                         \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c),      \
+                         sp(r), z, src_stride, &sse);                  \
+    } else {                                                           \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c),     \
+                          sp(r), z, src_stride, &sse, second_pred);    \
+    }                                                                  \
+    tmpmse = thismse;                                                  \
+    tmpmse += MVC(r, c);                                               \
+    if (tmpmse >= INT_MAX) {                                           \
+      v = INT_MAX;                                                     \
+    } else if ((v = (uint32_t)tmpmse) < besterr) {                     \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+#else
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
@@ -192,6 +209,7 @@
     v = INT_MAX;                                                       \
   }
 
+#endif
 #define FIRST_LEVEL_CHECKS                              \
   {                                                     \
     unsigned int left, right, up, down, diag;           \
@@ -320,10 +338,10 @@
                                        const uint8_t *second_pred,
                                        int w, int h, int offset,
                                        int *mvjcost, int *mvcost[2],
-                                       unsigned int *sse1,
-                                       int *distortion) {
-  unsigned int besterr;
+                                       uint32_t *sse1,
+                                       uint32_t *distortion) {
 #if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t besterr;
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
@@ -339,9 +357,13 @@
   } else {
     besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
   }
-  *distortion = besterr;
+  *distortion = (uint32_t)besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  if (besterr >= UINT32_MAX)
+    return UINT32_MAX;
+  return (uint32_t)besterr;
 #else
+  uint32_t besterr;
   (void) xd;
   if (second_pred != NULL) {
     DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
@@ -352,8 +374,8 @@
   }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
   return besterr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static INLINE int divide_and_round(const int n, const int d) {
@@ -383,7 +405,7 @@
                          (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
 }
 
-int vp9_skip_sub_pixel_tree(
+uint32_t vp9_skip_sub_pixel_tree(
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -393,8 +415,8 @@
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion,
-    unsigned int *sse1,
+    uint32_t *distortion,
+    uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -428,7 +450,7 @@
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned_evenmore(
+uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -438,8 +460,8 @@
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion,
-    unsigned int *sse1,
+    uint32_t *distortion,
+    uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -508,7 +530,7 @@
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+uint32_t vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
                                              MV *bestmv, const MV *ref_mv,
                                              int allow_hp,
                                              int error_per_bit,
@@ -517,8 +539,8 @@
                                              int iters_per_step,
                                              int *cost_list,
                                              int *mvjcost, int *mvcost[2],
-                                             int *distortion,
-                                             unsigned int *sse1,
+                                             uint32_t *distortion,
+                                             uint32_t *sse1,
                                              const uint8_t *second_pred,
                                              int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -582,7 +604,7 @@
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+uint32_t vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         MV *bestmv, const MV *ref_mv,
                                         int allow_hp,
                                         int error_per_bit,
@@ -591,8 +613,8 @@
                                         int iters_per_step,
                                         int *cost_list,
                                         int *mvjcost, int *mvcost[2],
-                                        int *distortion,
-                                        unsigned int *sse1,
+                                        uint32_t *distortion,
+                                        uint32_t *sse1,
                                         const uint8_t *second_pred,
                                         int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -684,19 +706,19 @@
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
-                                 MV *bestmv, const MV *ref_mv,
-                                 int allow_hp,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 int forced_stop,
-                                 int iters_per_step,
-                                 int *cost_list,
-                                 int *mvjcost, int *mvcost[2],
-                                 int *distortion,
-                                 unsigned int *sse1,
-                                 const uint8_t *second_pred,
-                                 int w, int h) {
+uint32_t vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
+                                      MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
+                                      int error_per_bit,
+                                      const vp9_variance_fn_ptr_t *vfp,
+                                      int forced_stop,
+                                      int iters_per_step,
+                                      int *cost_list,
+                                      int *mvjcost, int *mvcost[2],
+                                      uint32_t *distortion,
+                                      uint32_t *sse1,
+                                      const uint8_t *second_pred,
+                                      int w, int h) {
   const uint8_t *const z = x->plane[0].src.buf;
   const uint8_t *const src_address = z;
   const int src_stride = x->plane[0].src.stride;
@@ -835,7 +857,6 @@
 }
 
 #undef MVC
-#undef PRE
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
@@ -1392,12 +1413,22 @@
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV mv = {best_mv->row * 8, best_mv->col * 8};
-  unsigned int unused;
-
+  uint32_t unused;
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t err= vfp->vf(what->buf, what->stride,
+                        get_buf_from_mv(in_what, best_mv),
+                        in_what->stride, &unused);
+  err += (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                    x->mvcost, x->errorperbit) : 0);
+  if (err >= INT_MAX)
+    return INT_MAX;
+  return (int)err;
+#else
   return vfp->vf(what->buf, what->stride,
                  get_buf_from_mv(in_what, best_mv), in_what->stride, &unused) +
       (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
                                  x->mvcost, x->errorperbit) : 0);
+#endif
 }
 
 int vp9_get_mvpred_av_var(const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 1b0c860..86cd267 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -74,7 +74,7 @@
                                            BLOCK_SIZE bsize,
                                            int mi_row, int mi_col);
 
-typedef int (fractional_mv_step_fp) (
+typedef uint32_t (fractional_mv_step_fp) (
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -84,7 +84,7 @@
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion, unsigned int *sse1,
+    uint32_t *distortion, uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index 10ea010..4b43b38 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -39,12 +39,10 @@
   ne->num_frames_estimate = 20;
 }
 
-int enable_noise_estimation(VP9_COMP *const cpi) {
-  // Enable noise estimation if denoising is on (and cyclic refresh, since
-  // noise estimate is currently using a struct defined in cyclic refresh).
+static int enable_noise_estimation(VP9_COMP *const cpi) {
+  // Enable noise estimation if denoising is on.
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+  if (cpi->oxcf.noise_sensitivity > 0)
     return 1;
 #endif
   // Only allow noise estimate under certain encoding mode.
@@ -101,11 +99,10 @@
 
 void vp9_update_noise_estimate(VP9_COMP *const cpi) {
   const VP9_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
   // Estimate of noise level every frame_period frames.
-  int frame_period = 10;
-  int thresh_consec_zeromv = 8;
+  int frame_period = 8;
+  int thresh_consec_zeromv = 6;
   unsigned int thresh_sum_diff = 100;
   unsigned int thresh_sum_spatial = (200 * 200) << 8;
   unsigned int thresh_spatial_var = (32 * 32) << 8;
@@ -131,6 +128,14 @@
       ne->last_h = cm->height;
     }
     return;
+  } else if (cpi->rc.avg_frame_low_motion < 50) {
+    // Force noise estimation to 0 and denoiser off if content has high motion.
+    ne->level = kLowLow;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0)
+      vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
+    return;
   } else {
     int num_samples = 0;
     uint64_t avg_est = 0;
@@ -153,7 +158,7 @@
     for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
       for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
         int bl_index = mi_row * cm->mi_cols + mi_col;
-        if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+        if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
           num_low_motion++;
       }
     }
@@ -173,23 +178,26 @@
           // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
           // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
           // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
-          int consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
-                                     VPXMIN(cr->consec_zero_mv[bl_index1],
-                                     VPXMIN(cr->consec_zero_mv[bl_index2],
-                                     cr->consec_zero_mv[bl_index3])));
-          int is_skin = vp9_compute_skin_block(src_y,
-                                               src_u,
-                                               src_v,
-                                               src_ystride,
-                                               src_uvstride,
-                                               bsize,
-                                               consec_zeromv,
-                                               0);
+          int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                     VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                     VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                     cpi->consec_zero_mv[bl_index3])));
+          int is_skin = 0;
+          if (cpi->use_skin_detection) {
+            is_skin = vp9_compute_skin_block(src_y,
+                                             src_u,
+                                             src_v,
+                                             src_ystride,
+                                             src_uvstride,
+                                             bsize,
+                                             consec_zeromv,
+                                             0);
+          }
           if (frame_low_motion &&
-              cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
-              cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
-              cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
-              cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
               !is_skin) {
             // Compute variance.
             unsigned int sse;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 2e27f94..c13f24f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -40,6 +40,14 @@
   int in_use;
 } PRED_BUFFER;
 
+
+static const int pos_shift_16x16[4][4] = {
+  {9, 10, 13, 14},
+  {11, 12, 15, 16},
+  {17, 18, 21, 22},
+  {19, 20, 23, 24}
+};
+
 static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm,
                       const MACROBLOCK *x,
                       const MACROBLOCKD *xd,
@@ -149,7 +157,7 @@
   const int ref = mi->ref_frame[0];
   const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
   MV center_mv;
-  int dis;
+  uint32_t dis;
   int rate_mode;
   const int tmp_col_min = x->mv_col_min;
   const int tmp_col_max = x->mv_col_max;
@@ -244,7 +252,7 @@
                     &sse8x8[k], &sum8x8[k]);
       *sse += sse8x8[k];
       *sum += sum8x8[k];
-      var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
+      var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
       k++;
     }
   }
@@ -265,7 +273,7 @@
           sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
       sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
           sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
-      var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
+      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
           (b_width_log2_lookup[unit_size] +
               b_height_log2_lookup[unit_size] + 6));
       k++;
@@ -582,39 +590,46 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
-                      int *skippable, int64_t *sse, int plane,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   unsigned int var_y, sse_y;
-  (void)plane;
+
   (void)tx_size;
-  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
+                    &sse_y);
   *sse = INT_MAX;
   *skippable = 0;
   return;
 }
 #else
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
-                      int *skippable, int64_t *sse, int plane,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
   const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
   const int step = 1 << (tx_size << 1);
   const int block_step = (1 << tx_size);
   int block = 0, r, c;
-  int shift = tx_size == TX_32X32 ? 0 : 2;
   const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
-      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      xd->mb_to_right_edge >> 5);
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
-      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+      xd->mb_to_bottom_edge >> 5);
   int eob_cost = 0;
+  const int bw = 4 * num_4x4_w;
+  const int bh = 4 * num_4x4_h;
 
   (void)cpi;
-  vp9_subtract_plane(x, bsize, plane);
+
+  // The max tx_size passed in is TX_16X16.
+  assert(tx_size != TX_32X32);
+
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
   *skippable = 1;
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
@@ -626,18 +641,11 @@
         tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
         tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
         uint16_t *const eob = &p->eobs[block];
-        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+        const int diff_stride = bw;
         const int16_t *src_diff;
         src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
         switch (tx_size) {
-          case TX_32X32:
-            vpx_fdct32x32_rd(src_diff, coeff, diff_stride);
-            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                  p->round_fp, p->quant_fp, p->quant_shift,
-                                  qcoeff, dqcoeff, pd->dequant, eob,
-                                  scan_order->scan, scan_order->iscan);
-            break;
           case TX_16X16:
             vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
@@ -670,18 +678,17 @@
     }
   }
 
-  if (*skippable && *sse < INT64_MAX) {
-    *rate = 0;
-    *dist = (*sse << 6) >> shift;
-    *sse = *dist;
-    return;
+  this_rdc->rate = 0;
+  if (*sse < INT64_MAX) {
+    *sse = (*sse << 6) >> 2;
+    if (*skippable) {
+      this_rdc->dist = *sse;
+      return;
+    }
   }
 
   block = 0;
-  *rate = 0;
-  *dist = 0;
-  if (*sse < INT64_MAX)
-    *sse = (*sse << 6) >> shift;
+  this_rdc->dist = 0;
   for (r = 0; r < max_blocks_high; r += block_step) {
     for (c = 0; c < num_4x4_w; c += block_step) {
       if (c < max_blocks_wide) {
@@ -691,25 +698,26 @@
         uint16_t *const eob = &p->eobs[block];
 
         if (*eob == 1)
-          *rate += (int)abs(qcoeff[0]);
+          this_rdc->rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += vpx_satd((const int16_t *)qcoeff, step << 4);
+          this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
 
-        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+        this_rdc->dist +=
+            vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
       }
       block += step;
     }
   }
 
   // If skippable is set, rate gets clobbered later.
-  *rate <<= (2 + VP9_PROB_COST_SHIFT);
-  *rate += (eob_cost << VP9_PROB_COST_SHIFT);
+  this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
+  this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
 }
 #endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
-                               int *out_rate_sum, int64_t *out_dist_sum,
+                               RD_COST *this_rdc,
                                unsigned int *var_y, unsigned int *sse_y,
                                int start_plane, int stop_plane) {
   // Note our transform coeffs are 8 times an orthogonal transform.
@@ -720,8 +728,8 @@
   int64_t dist;
   int i;
 
-  *out_rate_sum = 0;
-  *out_dist_sum = 0;
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
 
   for (i = start_plane; i <= stop_plane; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
@@ -752,8 +760,8 @@
                                  dc_quant >> 3, &rate, &dist);
   #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    *out_rate_sum += rate >> 1;
-    *out_dist_sum += dist << 3;
+    this_rdc->rate += rate >> 1;
+    this_rdc->dist += dist << 3;
 
   #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -768,8 +776,8 @@
                                  ac_quant >> 3, &rate, &dist);
   #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    *out_rate_sum += rate;
-    *out_dist_sum += dist << 4;
+    this_rdc->rate += rate;
+    this_rdc->dist += dist << 4;
   }
 }
 
@@ -906,8 +914,7 @@
   MACROBLOCK *x;
   PREDICTION_MODE mode;
   int skippable;
-  int rate;
-  int64_t dist;
+  RD_COST *rdc;
 };
 
 static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -924,8 +931,7 @@
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   int i, j;
-  int rate;
-  int64_t dist;
+  RD_COST this_rdc;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 
@@ -942,19 +948,19 @@
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
     // TODO(jingning): This needs further refactoring.
-    block_yrd(cpi, x, &rate, &dist, &args->skippable, &this_sse, 0,
-              bsize_tx, VPXMIN(tx_size, TX_16X16));
+    block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
+              VPXMIN(tx_size, TX_16X16));
   } else {
     unsigned int var = 0;
     unsigned int sse = 0;
-    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
-                       plane, plane);
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
+                       plane);
   }
 
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
-  args->rate += rate;
-  args->dist += dist;
+  args->rdc->rate += this_rdc.rate;
+  args->rdc->dist += this_rdc.dist;
 }
 
 static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
@@ -1007,7 +1013,7 @@
   MODE_INFO *const mi = xd->mi[0];
   RD_COST this_rdc, best_rdc;
   PREDICTION_MODE this_mode;
-  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 };
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
   const TX_SIZE intra_tx_size =
       VPXMIN(max_txsize_lookup[bsize],
              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
@@ -1031,22 +1037,20 @@
   // Change the limit of this loop to add other intra prediction
   // mode tests.
   for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+    this_rdc.dist = this_rdc.rate = 0;
     args.mode = this_mode;
     args.skippable = 1;
-    args.rate = 0;
-    args.dist = 0;
+    args.rdc = &this_rdc;
     mi->tx_size = intra_tx_size;
     vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                            estimate_block_intra, &args);
     if (args.skippable) {
       x->skip_txfm[0] = SKIP_TXFM_AC_DC;
-      args.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+      this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
     } else {
       x->skip_txfm[0] = SKIP_TXFM_NONE;
-      args.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+      this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
     }
-    this_rdc.rate = args.rate;
-    this_rdc.dist = args.dist;
     this_rdc.rate += bmode_costs[this_mode];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                              this_rdc.rate, this_rdc.dist);
@@ -1105,7 +1109,7 @@
     {GOLDEN_FRAME, NEWMV}
 };
 
-int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
+static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
   // Reduce the intra cost penalty for small blocks (<=16x16).
   int reduction_fac =
@@ -1126,34 +1130,38 @@
                                  TileDataEnc *tile_data,
                                  int mi_row, int mi_col,
                                  struct buf_2d yv12_mb[4][MAX_MB_PLANE],
-                                 BLOCK_SIZE bsize) {
+                                 BLOCK_SIZE bsize,
+                                 int force_skip_low_temp_var) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   TileInfo *const tile_info = &tile_data->tile_info;
-// TODO(jingning) placeholder for inter-frame non-RD mode decision.
+  // TODO(jingning) placeholder for inter-frame non-RD mode decision.
   x->pred_mv_sad[ref_frame] = INT_MAX;
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   frame_mv[ZEROMV][ref_frame].as_int = 0;
-// this needs various further optimizations. to be continued..
+  // this needs various further optimizations. to be continued..
   if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
     vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                          sf, sf);
-    if (cm->use_prev_frame_mvs)
+    if (cm->use_prev_frame_mvs) {
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
                        candidates, mi_row, mi_col,
                        x->mbmi_ext->mode_context);
-    else
-    const_motion[ref_frame] =
-        mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
-            candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
-            (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    } else {
+      const_motion[ref_frame] =
+          mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
+                     candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
+                     (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    }
     vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
                           &frame_mv[NEARESTMV][ref_frame],
                           &frame_mv[NEARMV][ref_frame]);
-    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) {
+    // Early exit for golden frame if force_skip_low_temp_var is set.
+    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 &&
+        !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) {
       vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
                   ref_frame, bsize);
     }
@@ -1161,6 +1169,159 @@
     *ref_frame_skip_mask |= (1 << ref_frame);
   }
 }
+
+static void vp9_large_block_mv_bias(const NOISE_ESTIMATE *ne, RD_COST *this_rdc,
+                                    BLOCK_SIZE bsize, int mv_row, int mv_col,
+                                    int is_last_frame) {
+  // Bias against non-zero (above some threshold) motion for large blocks.
+  // This is temporary fix to avoid selection of large mv for big blocks.
+  if (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64) {
+    if (bsize == BLOCK_64X64)
+      this_rdc->rdcost = this_rdc->rdcost << 1;
+    else if (bsize >= BLOCK_32X32)
+      this_rdc->rdcost = 3 * this_rdc->rdcost >> 1;
+  }
+  // If noise estimation is enabled, and estimated level is above threshold,
+  // add a bias to LAST reference with small motion, for large blocks.
+  if (ne->enabled && ne->level >= kMedium &&
+      bsize >= BLOCK_32X32 && is_last_frame &&
+      mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) {
+    this_rdc->rdcost = 7 * this_rdc->rdcost >> 3;
+  }
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void vp9_pickmode_ctx_den_update(
+    VP9_PICKMODE_CTX_DEN *ctx_den,
+    int64_t zero_last_cost_orig,
+    int ref_frame_cost[MAX_REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+    int reuse_inter_pred,
+    TX_SIZE best_tx_size,
+    PREDICTION_MODE best_mode,
+    MV_REFERENCE_FRAME best_ref_frame,
+    INTERP_FILTER best_pred_filter,
+    uint8_t best_mode_skip_txfm) {
+  ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+  ctx_den->ref_frame_cost = ref_frame_cost;
+  ctx_den->frame_mv = frame_mv;
+  ctx_den->reuse_inter_pred = reuse_inter_pred;
+  ctx_den->best_tx_size = best_tx_size;
+  ctx_den->best_mode = best_mode;
+  ctx_den->best_ref_frame = best_ref_frame;
+  ctx_den->best_pred_filter = best_pred_filter;
+  ctx_den->best_mode_skip_txfm = best_mode_skip_txfm;
+}
+
+static void recheck_zeromv_after_denoising(
+    VP9_COMP *cpi, MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+    VP9_DENOISER_DECISION decision, VP9_PICKMODE_CTX_DEN *ctx_den,
+    struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_COST *best_rdc, BLOCK_SIZE bsize,
+    int mi_row, int mi_col) {
+  // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+  // denoised result. Only do this under noise conditions, and if rdcost of
+  // ZEROMV onoriginal source is not significantly higher than rdcost of best
+  // mode.
+  if (cpi->noise_estimate.enabled &&
+      cpi->noise_estimate.level > kLow &&
+      ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+      ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+       (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+        decision == FILTER_ZEROMV_BLOCK))) {
+    // Check if we should pick ZEROMV on denoised signal.
+    int rate = 0;
+    int64_t dist = 0;
+    uint32_t var_y = UINT_MAX;
+    uint32_t sse_y = UINT_MAX;
+    RD_COST this_rdc;
+    mi->mode = ZEROMV;
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE;
+    mi->mv[0].as_int = 0;
+    mi->interp_filter = EIGHTTAP;
+    xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+    this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] +
+        cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
+                            [INTER_OFFSET(ZEROMV)];
+    this_rdc.dist = dist;
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
+    // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
+    // is lower than best_ref mode (on original source).
+    if (this_rdc.rdcost > best_rdc->rdcost) {
+      this_rdc = *best_rdc;
+      mi->mode = ctx_den->best_mode;
+      mi->ref_frame[0] = ctx_den->best_ref_frame;
+      mi->interp_filter = ctx_den->best_pred_filter;
+      if (ctx_den->best_ref_frame == INTRA_FRAME)
+        mi->mv[0].as_int = INVALID_MV;
+      else if (ctx_den->best_ref_frame == GOLDEN_FRAME) {
+        mi->mv[0].as_int = ctx_den->frame_mv[ctx_den->best_mode]
+                                            [ctx_den->best_ref_frame].as_int;
+        if (ctx_den->reuse_inter_pred) {
+          xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+          vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        }
+      }
+      mi->tx_size = ctx_den->best_tx_size;
+      x->skip_txfm[0] = ctx_den->best_mode_skip_txfm;
+    } else {
+      ctx_den->best_ref_frame = LAST_FRAME;
+      *best_rdc = this_rdc;
+    }
+  }
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize) {
+  const int i = (mi_row & 0x7) >> 1;
+  const int j = (mi_col & 0x7) >> 1;
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  if (bsize == BLOCK_64X64) {
+    force_skip_low_temp_var = variance_low[0];
+  } else if (bsize == BLOCK_64X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[1];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[2];
+    }
+  } else if (bsize == BLOCK_32X64) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[3];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[4];
+    }
+  } else if (bsize == BLOCK_32X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[5];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[6];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[7];
+    } else if ((mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[8];
+    }
+  } else if (bsize == BLOCK_16X16) {
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+  } else if (bsize == BLOCK_32X16) {
+    // The col shift index for the second 16x16 block.
+    const int j2 = ((mi_col + 2) & 0x7) >> 1;
+    // Only if each 16x16 block inside has low temporal variance.
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+                              variance_low[pos_shift_16x16[i][j2]];
+  } else if (bsize == BLOCK_16X32) {
+    // The row shift index for the second 16x16 block.
+    const int i2 = ((mi_row + 2) & 0x7) >> 1;
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+                              variance_low[pos_shift_16x16[i2][j]];
+  }
+  return force_skip_low_temp_var;
+}
+
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          TileDataEnc *tile_data,
                          int mi_row, int mi_col, RD_COST *rd_cost,
@@ -1218,7 +1379,10 @@
   int ref_frame_cost[MAX_REF_FRAMES];
   int svc_force_zero_mode[3] = {0};
   int perform_intra_pred = 1;
+  int use_golden_nonzeromv = 1;
+  int force_skip_low_temp_var = 0;
 #if CONFIG_VP9_TEMPORAL_DENOISING
+  VP9_PICKMODE_CTX_DEN ctx_den;
   int64_t zero_last_cost_orig = INT64_MAX;
 #endif
 
@@ -1303,10 +1467,19 @@
     }
   }
 
+  if (cpi->sf.short_circuit_low_temp_var) {
+    force_skip_low_temp_var =
+        get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+  }
+
+  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+      !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
+    use_golden_nonzeromv = 0;
+
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
     find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
                     &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col,
-                    yv12_mb, bsize);
+                    yv12_mb, bsize, force_skip_low_temp_var);
   }
 
   for (idx = 0; idx < RT_INTER_MODES; ++idx) {
@@ -1318,6 +1491,7 @@
     int is_skippable;
     int this_early_term = 0;
     PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+
     if (cpi->use_svc)
       this_mode = ref_mode_set_svc[idx].pred_mode;
 
@@ -1336,17 +1510,33 @@
 
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
+
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
+    // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
+    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+    // later.
+    if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+        frame_mv[this_mode][ref_frame].as_int != 0) {
+      continue;
+    }
+
+    if (cpi->sf.short_circuit_low_temp_var == 2 &&
+        force_skip_low_temp_var && ref_frame == LAST_FRAME &&
+        this_mode == NEWMV) {
+      continue;
+    }
+
     if (cpi->use_svc) {
       if (svc_force_zero_mode[ref_frame - 1] &&
           frame_mv[this_mode][ref_frame].as_int != 0)
         continue;
     }
 
-    if (!(frame_mv[this_mode][ref_frame].as_int == 0 &&
-        ref_frame == LAST_FRAME)) {
+    if (!force_skip_low_temp_var &&
+        !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+          ref_frame == LAST_FRAME)) {
       i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
       if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
         if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
@@ -1374,7 +1564,8 @@
           !cpi->use_svc &&
           cpi->oxcf.rc_mode == VPX_CBR) {
         int tmp_sad;
-        int dis, cost_list[5];
+        uint32_t dis;
+        int cost_list[5];
 
         if (bsize < BLOCK_16X16)
           continue;
@@ -1437,7 +1628,10 @@
       }
     }
 
-    if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+    // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no
+    // need to compute best_pred_sad which is only used to skip golden NEWMV.
+    if (use_golden_nonzeromv && this_mode == NEWMV &&
+        ref_frame == LAST_FRAME &&
         frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
       const int pre_stride = xd->plane[0].pre[0].stride;
       const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
@@ -1449,21 +1643,6 @@
       x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
     }
 
-    if (cpi->use_svc) {
-      if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME &&
-          frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) {
-        const int pre_stride = xd->plane[0].pre[0].stride;
-        const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3);
-        best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
-                                               x->plane[0].src.stride,
-                                               pre_buf, pre_stride);
-        x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad;
-      }
-    }
-
-
     if (this_mode != NEARESTMV &&
         frame_mv[this_mode][ref_frame].as_int ==
             frame_mv[NEARESTMV][ref_frame].as_int)
@@ -1487,8 +1666,9 @@
 
     if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
         && (ref_frame == LAST_FRAME ||
-            (ref_frame == GOLDEN_FRAME && cpi->use_svc))
-        && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
+           (ref_frame == GOLDEN_FRAME &&
+           (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
+           (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
       int pf_rate[3];
       int64_t pf_dist[3];
       unsigned int pf_var[3];
@@ -1517,12 +1697,9 @@
               free_pred_buffer(this_mode_pred);
               this_mode_pred = current_pred;
             }
-
-            if (filter < EIGHTTAP_SHARP) {
-              current_pred = &tmp[get_pred_buffer(tmp, 3)];
-              pd->dst.buf = current_pred->data;
-              pd->dst.stride = bw;
-            }
+            current_pred = &tmp[get_pred_buffer(tmp, 3)];
+            pd->dst.buf = current_pred->data;
+            pd->dst.stride = bw;
           }
         }
       }
@@ -1546,7 +1723,7 @@
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
 
       // For large partition blocks, extra testing is done.
-      if (bsize > BLOCK_32X32 &&
+      if (cpi->oxcf.rc_mode == VPX_CBR && bsize > BLOCK_32X32 &&
         !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
         cm->base_qindex) {
         model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
@@ -1560,8 +1737,8 @@
 
     if (!this_early_term) {
       this_sse = (int64_t)sse_y;
-      block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
-                &this_sse, 0, bsize, VPXMIN(mi->tx_size, TX_16X16));
+      block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
+                VPXMIN(mi->tx_size, TX_16X16));
       x->skip_txfm[0] = is_skippable;
       if (is_skippable) {
         this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -1587,17 +1764,15 @@
     }
 
     if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
-      int uv_rate = 0;
-      int64_t uv_dist = 0;
+      RD_COST rdc_uv;
       const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
       if (x->color_sensitivity[0])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
       if (x->color_sensitivity[1])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
-      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &uv_rate, &uv_dist,
-                         &var_y, &sse_y, 1, 2);
-      this_rdc.rate += uv_rate;
-      this_rdc.dist += uv_dist;
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
+      this_rdc.rate += rdc_uv.rate;
+      this_rdc.dist += rdc_uv.dist;
     }
 
     this_rdc.rate += rate_mv;
@@ -1607,32 +1782,15 @@
     this_rdc.rate += ref_frame_cost[ref_frame];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
 
+    // Bias against non-zero motion
     if (cpi->oxcf.rc_mode == VPX_CBR &&
         cpi->oxcf.speed >= 5 &&
         cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
         !x->sb_is_skin) {
-      // Bias against non-zero (above some threshold) motion for large blocks.
-      // This is temporary fix to avoid selection of large mv for big blocks.
-      if (frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
-          frame_mv[this_mode][ref_frame].as_mv.row < -64 ||
-          frame_mv[this_mode][ref_frame].as_mv.col > 64 ||
-          frame_mv[this_mode][ref_frame].as_mv.col < -64) {
-        if (bsize == BLOCK_64X64)
-          this_rdc.rdcost = this_rdc.rdcost << 1;
-        else if (bsize >= BLOCK_32X32)
-          this_rdc.rdcost = 3 * this_rdc.rdcost >> 1;
-      }
-      // If noise estimation is enabled, and estimated level is above threshold,
-      // add a bias to LAST reference with small motion, for large blocks.
-      if (cpi->noise_estimate.enabled &&
-          cpi->noise_estimate.level >= kMedium &&
-          bsize >= BLOCK_32X32 &&
-          ref_frame == LAST_FRAME &&
-          frame_mv[this_mode][ref_frame].as_mv.row < 8 &&
-          frame_mv[this_mode][ref_frame].as_mv.row > -8 &&
-          frame_mv[this_mode][ref_frame].as_mv.col < 8 &&
-          frame_mv[this_mode][ref_frame].as_mv.col > -8)
-        this_rdc.rdcost = 7 * this_rdc.rdcost >> 3;
+      vp9_large_block_mv_bias(&cpi->noise_estimate, &this_rdc, bsize,
+                              frame_mv[this_mode][ref_frame].as_mv.row,
+                              frame_mv[this_mode][ref_frame].as_mv.col,
+                              ref_frame == LAST_FRAME);
     }
 
     // Skipping checking: test to see if this block can be reconstructed by
@@ -1710,11 +1868,12 @@
   }
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if (perform_intra_pred &&
-      ((best_rdc.rdcost == INT64_MAX ||
-      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
-       bsize <= cpi->sf.max_intra_bsize)))) {
-    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 };
+  if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) &&
+      perform_intra_pred &&
+      (best_rdc.rdcost == INT64_MAX ||
+       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+        bsize <= cpi->sf.max_intra_bsize))) {
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
     TX_SIZE intra_tx_size =
@@ -1763,10 +1922,10 @@
 
       mi->mode = this_mode;
       mi->ref_frame[0] = INTRA_FRAME;
+      this_rdc.dist = this_rdc.rate = 0;
       args.mode = this_mode;
       args.skippable = 1;
-      args.rate = 0;
-      args.dist = 0;
+      args.rdc = &this_rdc;
       mi->tx_size = intra_tx_size;
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
@@ -1774,10 +1933,10 @@
       // mirrors the behavior used by inter
       if (args.skippable) {
         x->skip_txfm[0] = SKIP_TXFM_AC_DC;
-        args.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
       } else {
         x->skip_txfm[0] = SKIP_TXFM_NONE;
-        args.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+        this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
       }
       // Inter and intra RD will mismatch in scale for non-screen content.
       if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
@@ -1788,8 +1947,6 @@
           vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
                                                  estimate_block_intra, &args);
       }
-      this_rdc.rate = args.rate;
-      this_rdc.dist = args.dist;
       this_rdc.rate += cpi->mbmode_cost[this_mode];
       this_rdc.rate += ref_frame_cost[INTRA_FRAME];
       this_rdc.rate += intra_cost_penalty;
@@ -1845,55 +2002,14 @@
       cpi->denoiser.denoising_level > kDenLowLow &&
       cpi->denoiser.reset == 0) {
     VP9_DENOISER_DECISION decision = COPY_BLOCK;
-    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, VPXMAX(BLOCK_8X8, bsize),
-                         ctx, &decision);
-    // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
-    // result. Only do this under noise conditions, and if rdcost of ZEROMV on
-    // original source is not significantly higher than rdcost of best mode.
-    if (((best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
-        (best_ref_frame == GOLDEN_FRAME && decision == FILTER_ZEROMV_BLOCK)) &&
-        cpi->noise_estimate.enabled &&
-        cpi->noise_estimate.level > kLow &&
-        zero_last_cost_orig < (best_rdc.rdcost << 3)) {
-      // Check if we should pick ZEROMV on denoised signal.
-      int rate = 0;
-      int64_t dist = 0;
-      mi->mode = ZEROMV;
-      mi->ref_frame[0] = LAST_FRAME;
-      mi->ref_frame[1] = NONE;
-      mi->mv[0].as_int = 0;
-      mi->interp_filter = EIGHTTAP;
-      xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
-      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
-      this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] +
-          cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
-                              [INTER_OFFSET(ZEROMV)];
-      this_rdc.dist = dist;
-      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
-      // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
-      // is lower than best_ref mode (on original source).
-      if (this_rdc.rdcost > best_rdc.rdcost) {
-        this_rdc = best_rdc;
-        mi->mode = best_mode;
-        mi->ref_frame[0] = best_ref_frame;
-        mi->interp_filter = best_pred_filter;
-        if (best_ref_frame == INTRA_FRAME)
-          mi->mv[0].as_int = INVALID_MV;
-        else if (best_ref_frame == GOLDEN_FRAME) {
-          mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
-          if (reuse_inter_pred) {
-            xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
-            vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-          }
-        }
-        mi->tx_size = best_tx_size;
-        x->skip_txfm[0] = best_mode_skip_txfm;
-      } else {
-        best_ref_frame = LAST_FRAME;
-        best_rdc = this_rdc;
-      }
-    }
+    vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost,
+                                frame_mv, reuse_inter_pred, best_tx_size,
+                                best_mode, best_ref_frame, best_pred_filter,
+                                best_mode_skip_txfm);
+    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision);
+    recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb,
+                                   &best_rdc, bsize, mi_row, mi_col);
+    best_ref_frame = ctx_den.best_ref_frame;
   }
 #endif
 
@@ -2060,7 +2176,7 @@
             const int tmp_col_max = x->mv_col_max;
             const int tmp_row_min = x->mv_row_min;
             const int tmp_row_max = x->mv_row_max;
-            int dummy_dist;
+            uint32_t dummy_dist;
 
             if (i == 0) {
               mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 91f877e..d68b684 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -94,7 +94,7 @@
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int64_t tmp = abs_coeff + round_ptr[rc != 0];
-      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
       if (abs_qcoeff)
@@ -219,12 +219,12 @@
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
-  int l;
+  int l, m;
   t = d;
   for (l = 0; t > 1; l++)
     t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (int16_t)(t - (1 << 16));
+  m = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(m - (1 << 16));
   *shift = 1 << (16 - l);
 }
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index dab7f67..b45f8d0 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -337,6 +337,10 @@
   rc->total_actual_bits = 0;
   rc->total_target_bits = 0;
   rc->total_target_vs_actual = 0;
+  rc->avg_frame_low_motion = 0;
+  rc->high_source_sad = 0;
+  rc->count_last_scene_change = 0;
+  rc->avg_source_sad = 0;
 
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
@@ -948,11 +952,14 @@
                              FIXED_GF_INTERVAL], cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      // Use the lower of active_worst_quality and recent/average Q.
-      if (cm->current_video_frame > 1)
-        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
-      else
+      // Use the min of the average Q and active_worst_quality as basis for
+      // active_best.
+      if (cm->current_video_frame > 1) {
+        q = VPXMIN(rc->avg_frame_qindex[INTER_FRAME], active_worst_quality);
+        active_best_quality = inter_minq[q];
+      } else {
         active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      }
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
       if ((oxcf->rc_mode == VPX_CQ) &&
@@ -1152,8 +1159,7 @@
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+  if (cpi->oxcf.rc_mode != VPX_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
          (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
@@ -1329,6 +1335,26 @@
   }
 }
 
+static void compute_frame_low_motion(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_row, mi_col;
+  MODE_INFO **mi = cm->mi_grid_visible;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  int cnt_zeromv = 0;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      if (abs(mi[0]->mv[0].as_mv.row) < 16 &&
+          abs(mi[0]->mv[0].as_mv.col) < 16)
+        cnt_zeromv++;
+      mi++;
+    }
+    mi += 8;
+  }
+  cnt_zeromv = 100 * cnt_zeromv / (rows * cols);
+  rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2;
+}
+
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1438,6 +1464,11 @@
         rc->next_frame_size_selector != rc->frame_size_selector;
     rc->frame_size_selector = rc->next_frame_size_selector;
   }
+
+  if (oxcf->pass == 0) {
+    if (cm->frame_type != KEY_FRAME)
+      compute_frame_low_motion(cpi);
+  }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1476,6 +1507,24 @@
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
+static void adjust_gf_key_frame(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->constrained_gf_group = 0;
+  // Reset gf interval to make more equal spacing for up-coming key frame.
+  if ((rc->frames_to_key <= 7 * rc->baseline_gf_interval >> 2) &&
+      (rc->frames_to_key > rc->baseline_gf_interval)) {
+    rc->baseline_gf_interval = rc->frames_to_key >> 1;
+    if (rc->baseline_gf_interval < 5)
+      rc->baseline_gf_interval = rc->frames_to_key;
+    rc->constrained_gf_group = 1;
+  } else {
+    // Reset since frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->baseline_gf_interval > rc->frames_to_key) {
+      rc->baseline_gf_interval = rc->frames_to_key;
+      rc->constrained_gf_group = 1;
+    }
+  }
+}
 void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1496,23 +1545,33 @@
     cm->frame_type = INTER_FRAME;
   }
   if (rc->frames_till_gf_update_due == 0) {
+    double rate_err = 1.0;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
       vp9_cyclic_refresh_set_golden_update(cpi);
     } else {
       rc->baseline_gf_interval =
           (rc->min_gf_interval + rc->max_gf_interval) / 2;
     }
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
-    if (rc->frames_till_gf_update_due > rc->frames_to_key) {
-      rc->frames_till_gf_update_due = rc->frames_to_key;
-      rc->constrained_gf_group = 1;
-    } else {
-      rc->constrained_gf_group = 0;
+    if (rc->rolling_target_bits > 0)
+      rate_err =
+          (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+    // Increase gf interval at high Q and high overshoot.
+    if (cm->current_video_frame > 30 &&
+        rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 &&
+        rate_err > 3.5) {
+      rc->baseline_gf_interval =
+          VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
+    } else if (cm->current_video_frame > 30 &&
+               rc->avg_frame_low_motion < 20) {
+      // Decrease boost and gf interval for high motion case.
+      rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+      rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1);
     }
+    adjust_gf_key_frame(cpi);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
-    rc->gfu_boost = DEFAULT_GF_BOOST;
   }
   if (cm->frame_type == KEY_FRAME)
     target = calc_iframe_target_size_one_pass_vbr(cpi);
@@ -1833,27 +1892,28 @@
   RATE_CONTROL *const rc = &cpi->rc;
   int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
   int max_delta;
-  double position_factor = 1.0;
+  int frame_window = VPXMIN(16,
+      ((int)cpi->twopass.total_stats.count - cpi->common.current_video_frame));
 
-  // How far through the clip are we.
-  // This number is used to damp the per frame rate correction.
-  // Range 0 - 1.0
-  if (cpi->twopass.total_stats.count) {
-    position_factor = sqrt((double)cpi->common.current_video_frame /
-                           cpi->twopass.total_stats.count);
-  }
-  max_delta = (int)(position_factor *
-                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+  // Calcluate the adjustment to rate for this frame.
+  if (frame_window > 0) {
+    max_delta = (vbr_bits_off_target > 0)
+        ? (int)(vbr_bits_off_target / frame_window)
+        : (int)(-vbr_bits_off_target / frame_window);
 
-  // vbr_bits_off_target > 0 means we have extra bits to spend
-  if (vbr_bits_off_target > 0) {
-    *this_frame_target +=
-      (vbr_bits_off_target > max_delta) ? max_delta
-                                        : (int)vbr_bits_off_target;
-  } else {
-    *this_frame_target -=
-      (vbr_bits_off_target < -max_delta) ? max_delta
-                                         : (int)-vbr_bits_off_target;
+    max_delta = VPXMIN(max_delta,
+        ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    if (vbr_bits_off_target > 0) {
+      *this_frame_target +=
+        (vbr_bits_off_target > max_delta) ? max_delta
+                                          : (int)vbr_bits_off_target;
+    } else {
+      *this_frame_target -=
+        (vbr_bits_off_target < -max_delta) ? max_delta
+                                           : (int)-vbr_bits_off_target;
+    }
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
@@ -2088,6 +2148,7 @@
     // For VBR, under scene change/high content change, force golden refresh.
     if (cpi->oxcf.rc_mode == VPX_VBR &&
         rc->high_source_sad &&
+        rc->frames_to_key > 3 &&
         rc->count_last_scene_change > 4 &&
         cpi->ext_refresh_frame_flags_pending == 0) {
       int target;
@@ -2095,9 +2156,8 @@
       rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
       rc->baseline_gf_interval = VPXMIN(20,
           VPXMAX(10, rc->baseline_gf_interval));
+      adjust_gf_key_frame(cpi);
       rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-      if (rc->frames_till_gf_update_due > rc->frames_to_key)
-        rc->frames_till_gf_update_due = rc->frames_to_key;
       target = calc_pframe_target_size_one_pass_vbr(cpi);
       vp9_rc_set_frame_target(cpi, target);
       rc->count_last_scene_change = 0;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 4cd4b12..7024bcf 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -162,6 +162,7 @@
   uint64_t avg_source_sad;
   int high_source_sad;
   int count_last_scene_change;
+  int avg_frame_low_motion;
 } RATE_CONTROL;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index a304182..0ed0850 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1214,6 +1214,11 @@
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) && need_top_left[mode])
+      continue;
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 
     xd->mi[0]->uv_mode = mode;
 
@@ -1710,8 +1715,8 @@
     x->mv_row_max = tmp_row_max;
 
     if (bestsme < INT_MAX) {
-      int dis; /* TODO: use dis in distortion calculation later. */
-      unsigned int sse;
+      uint32_t dis; /* TODO: use dis in distortion calculation later. */
+      uint32_t sse;
       bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv,
           &ref_mv[id].as_mv,
@@ -1911,7 +1916,7 @@
               INT_MAX, 1);
 
           if (bestsme < INT_MAX) {
-            int distortion;
+            uint32_t distortion;
             cpi->find_fractional_mv_step(
                 x,
                 new_mv,
@@ -2341,7 +2346,7 @@
   x->mv_row_max = tmp_row_max;
 
   if (bestsme < INT_MAX) {
-    int dis;  /* TODO: use dis in distortion calculation later. */
+    uint32_t dis;  /* TODO: use dis in distortion calculation later. */
     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index ff0dfce..23a5fc7 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -88,7 +88,7 @@
                            int stride, int strideuv, int bsize,
                            int consec_zeromv, int curr_motion_magn) {
   // No skin if block has been zero/small motion for long consecutive time.
-  if (consec_zeromv > 80 && curr_motion_magn == 0) {
+  if (consec_zeromv > 60 && curr_motion_magn == 0) {
     return 0;
   } else {
     int motion = 1;
@@ -100,7 +100,7 @@
     const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
     const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
     const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
-    if (consec_zeromv > 30 && curr_motion_magn == 0)
+    if (consec_zeromv > 25 && curr_motion_magn == 0)
       motion = 0;
     return vp9_skin_pixel(ysource, usource, vsource, motion);
   }
@@ -112,7 +112,6 @@
 void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
   int i, j, mi_row, mi_col, num_bl;
   VP9_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   uint8_t *y;
   const uint8_t *src_y = cpi->Source->y_buffer;
   const uint8_t *src_u = cpi->Source->u_buffer;
@@ -166,19 +165,17 @@
       } else {
         int block_size = BLOCK_8X8;
         int consec_zeromv = 0;
-        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-          int bl_index = mi_row * cm->mi_cols + mi_col;
-          int bl_index1 = bl_index + 1;
-          int bl_index2 = bl_index + cm->mi_cols;
-          int bl_index3 = bl_index2 + 1;
-          if (y_bsize == 8)
-            consec_zeromv = cr->consec_zero_mv[bl_index];
-          else
-            consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
-                                     VPXMIN(cr->consec_zero_mv[bl_index1],
-                                     VPXMIN(cr->consec_zero_mv[bl_index2],
-                                     cr->consec_zero_mv[bl_index3])));
-        }
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        if (y_bsize == 8)
+          consec_zeromv = cpi->consec_zero_mv[bl_index];
+        else
+          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                 VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                 VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                 cpi->consec_zero_mv[bl_index3])));
         if (y_bsize == 16)
           block_size = BLOCK_16X16;
         is_skin  = vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 02be3c3..e7f04a2 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -429,6 +429,11 @@
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+        content != VP9E_CONTENT_SCREEN) {
+      // Enable short circuit for low temporal variance.
+      sf->short_circuit_low_temp_var = 1;
+    }
   }
 
   if (speed >= 7) {
@@ -445,6 +450,17 @@
     sf->adaptive_rd_thresh = 4;
     sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+    // Only keep INTRA_DC mode for speed 8.
+    if (!is_keyframe) {
+      int i = 0;
+      for (i = 0; i < BLOCK_SIZES; ++i)
+        sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+    }
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+        content != VP9E_CONTENT_SCREEN) {
+      // More aggressive short circuit for speed 8.
+      sf->short_circuit_low_temp_var = 2;
+    }
   }
 }
 
@@ -554,6 +570,7 @@
   sf->default_interp_filter = SWITCHABLE;
   sf->simple_model_rd_from_var = 0;
   sf->short_circuit_flat_blocks = 0;
+  sf->short_circuit_low_temp_var = 0;
 
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->adaptive_rd_thresh = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 90b3216..e88a7df 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -446,6 +446,14 @@
   // Skip a number of expensive mode evaluations for blocks with zero source
   // variance.
   int short_circuit_flat_blocks;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32.
+  // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
+  // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
+  // 32x16.
+  int short_circuit_low_temp_var;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index cbd3c49..29db015 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -14,8 +14,6 @@
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_subexp.h"
 
-#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
-
 static const uint8_t update_bits[255] = {
    5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
@@ -34,6 +32,7 @@
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  0,
 };
+#define MIN_DELP_BITS 5
 
 static int recenter_nonneg(int v, int m) {
   if (v > (m << 1))
@@ -123,14 +122,17 @@
   int bestsavings = 0;
   vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
+  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
 
-  for (newp = *bestp; newp != oldp; newp += step) {
-    const int new_b = cost_branch256(ct, newp);
-    const int update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
-    const int savings = old_b - new_b - update_b;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
+  if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
+    for (newp = *bestp; newp != oldp; newp += step) {
+      const int new_b = cost_branch256(ct, newp);
+      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      const int savings = old_b - new_b - update_b;
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
     }
   }
   *bestp = bestnewp;
@@ -138,41 +140,40 @@
 }
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob *oldp,
+                                              const vpx_prob oldp,
                                               vpx_prob *bestp,
                                               vpx_prob upd,
                                               int stepsize) {
   int i, old_b, new_b, update_b, savings, bestsavings;
   int newp;
-  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+  const int step_sign = *bestp > oldp ? -1 : 1;
   const int step = stepsize * step_sign;
-  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
-  vp9_model_to_full_probs(oldp, oldplist);
-  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
-  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
-    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
-  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+  const vpx_prob *newplist, *oldplist;
+  vpx_prob bestnewp;
+  oldplist = vp9_pareto8_full[oldp - 1];
+  old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
+  for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
 
   bestsavings = 0;
-  bestnewp = oldp[PIVOT_NODE];
+  bestnewp = oldp;
 
   assert(stepsize > 0);
 
-  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0;
-      newp += step) {
-    if (newp < 1 || newp > 255)
-      continue;
-    newplist[PIVOT_NODE] = newp;
-    vp9_model_to_full_probs(newplist, newplist);
-    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
-      new_b += cost_branch256(ct + 2 * i, newplist[i]);
-    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
-    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
-        vp9_cost_upd256;
-    savings = old_b - new_b - update_b;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
+  if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
+    for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
+      if (newp < 1 || newp > 255) continue;
+      newplist = vp9_pareto8_full[newp - 1];
+      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+      for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+        new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
+      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      savings = old_b - new_b - update_b;
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
     }
   }
 
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index b968232..efe62c0 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -32,7 +32,7 @@
 
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob *oldp,
+                                              const vpx_prob oldp,
                                               vpx_prob *bestp,
                                               vpx_prob upd,
                                               int stepsize);
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 73048f8..1814a32 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -327,12 +327,12 @@
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = cr->map;
     uint8_t *temp2 = cr->last_coded_q_map;
-    uint8_t *temp3 = cr->consec_zero_mv;
+    uint8_t *temp3 = cpi->consec_zero_mv;
     cr->map = lc->map;
     lc->map = temp;
     cr->last_coded_q_map = lc->last_coded_q_map;
     lc->last_coded_q_map = temp2;
-    cr->consec_zero_mv = lc->consec_zero_mv;
+    cpi->consec_zero_mv = lc->consec_zero_mv;
     lc->consec_zero_mv = temp3;
     cr->sb_index = lc->sb_index;
   }
@@ -360,8 +360,8 @@
     cr->map = temp;
     lc->last_coded_q_map = cr->last_coded_q_map;
     cr->last_coded_q_map = temp2;
-    lc->consec_zero_mv = cr->consec_zero_mv;
-    cr->consec_zero_mv = temp3;
+    lc->consec_zero_mv = cpi->consec_zero_mv;
+    cpi->consec_zero_mv = temp3;
     lc->sb_index = cr->sb_index;
   }
 }
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ebe28b8..02bcf5a 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -143,8 +143,8 @@
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = i + idy;
-          int col = j + idx;
+          int row = (int)i + idy;
+          int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height &&
               col >= 0 && col < (int)block_width) {
@@ -211,8 +211,8 @@
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = i + idy;
-          int col = j + idx;
+          int row = (int)i + idy;
+          int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height &&
               col >= 0 && col < (int)block_width) {
@@ -264,8 +264,8 @@
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
-  int distortion;
-  unsigned int sse;
+  uint32_t distortion;
+  uint32_t sse;
   int cost_list[5];
 
   MV best_ref_mv1 = {0, 0};
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
similarity index 100%
rename from vp9/encoder/x86/vp9_dct_sse2.c
rename to vp9/encoder/x86/vp9_dct_intrin_sse2.c
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 7a7a6b6..0000000
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
new file mode 100644
index 0000000..d3b2a27
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -0,0 +1,87 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+; TODO(linfeng): The duplication with vp10 should be resolved.
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c
index bf7c7af..883507a 100644
--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -125,7 +125,7 @@
   return acc_diff;
 }
 
-// Denoiser for 4xM and 8xM blocks.
+// Denoise 8x8 and 8x16 blocks.
 static int vp9_denoiser_NxM_sse2_small(
     const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
     int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
@@ -147,9 +147,9 @@
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
-  const uint8_t shift = (width == 4) ? 2 : 1;
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
 
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+  for (r = 0; r < b_height; ++r) {
     memcpy(sig_buffer[r], sig, width);
     memcpy(sig_buffer[r] + width, sig + sig_stride, width);
     memcpy(mc_running_buffer[r], mc_running_avg_y, width);
@@ -157,18 +157,6 @@
            mc_running_avg_y + mc_avg_y_stride, width);
     memcpy(running_buffer[r], running_avg_y, width);
     memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
-    if (width == 4) {
-      memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
-      memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
-      memcpy(mc_running_buffer[r] + width * 2,
-             mc_running_avg_y + mc_avg_y_stride * 2, width);
-      memcpy(mc_running_buffer[r] + width * 3,
-             mc_running_avg_y + mc_avg_y_stride * 3, width);
-      memcpy(running_buffer[r] + width * 2,
-             running_avg_y + avg_y_stride * 2, width);
-      memcpy(running_buffer[r] + width * 3,
-             running_avg_y + avg_y_stride * 3, width);
-    }
     acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
                                       mc_running_buffer[r],
                                       running_buffer[r],
@@ -176,16 +164,10 @@
                                       &l3, &l32, &l21, acc_diff);
     memcpy(running_avg_y, running_buffer[r], width);
     memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
-    if (width == 4) {
-      memcpy(running_avg_y + avg_y_stride * 2,
-             running_buffer[r] + width * 2, width);
-      memcpy(running_avg_y + avg_y_stride * 3,
-             running_buffer[r] + width * 3, width);
-    }
     // Update pointers for next iteration.
-    sig += (sig_stride << shift);
-    mc_running_avg_y += (mc_avg_y_stride << shift);
-    running_avg_y += (avg_y_stride << shift);
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
   }
 
   {
@@ -207,22 +189,16 @@
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
           acc_diff = vp9_denoiser_adj_16x1_sse2(
               sig_buffer[r], mc_running_buffer[r], running_buffer[r],
               k_0, k_delta, acc_diff);
           memcpy(running_avg_y, running_buffer[r], width);
           memcpy(running_avg_y + avg_y_stride,
                  running_buffer[r] + width, width);
-          if (width == 4) {
-            memcpy(running_avg_y + avg_y_stride * 2,
-                   running_buffer[r] + width * 2, width);
-            memcpy(running_avg_y + avg_y_stride * 3,
-                   running_buffer[r] + width * 3, width);
-          }
           // Update pointers for next iteration.
-          running_avg_y += (avg_y_stride << shift);
+          running_avg_y += (avg_y_stride << 1);
         }
         sum_diff = sum_diff_16x1(acc_diff);
         if (abs(sum_diff) > sum_diff_thresh) {
@@ -236,7 +212,7 @@
   return FILTER_BLOCK;
 }
 
-// Denoiser for 16xM, 32xM and 64xM blocks
+// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
 static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
                                      const uint8_t *mc_running_avg_y,
                                      int mc_avg_y_stride,
@@ -260,38 +236,37 @@
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
 
-  for (c = 0; c < 4; ++c) {
-    for (r = 0; r < 4; ++r) {
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
       acc_diff[c][r] = _mm_setzero_si128();
     }
   }
 
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
           sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
-          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
       // Update pointers for next iteration.
       sig += 16;
       mc_running_avg_y += 16;
       running_avg_y += 16;
     }
 
-    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
       }
     }
 
     // Update pointers for next iteration.
-    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-    mc_running_avg_y = mc_running_avg_y -
-                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                       mc_avg_y_stride;
-    running_avg_y = running_avg_y -
-                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                    avg_y_stride;
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
   }
 
   {
@@ -303,33 +278,29 @@
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
-        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
         sum_diff = 0;
-        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
                 sig, mc_running_avg_y, running_avg_y, k_0,
-                k_delta, acc_diff[c>>4][r>>4]);
+                k_delta, acc_diff[c][r>>4]);
             // Update pointers for next iteration.
             sig += 16;
             mc_running_avg_y += 16;
             running_avg_y += 16;
           }
 
-          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
             }
           }
-          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-          mc_running_avg_y = mc_running_avg_y -
-                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                             mc_avg_y_stride;
-          running_avg_y = running_avg_y -
-                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                          avg_y_stride;
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
         }
         if (abs(sum_diff) > sum_diff_thresh) {
           return COPY_BLOCK;
@@ -349,26 +320,21 @@
                              int increase_denoising,
                              BLOCK_SIZE bs,
                              int motion_magnitude) {
-  if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
-    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
-                                       mc_avg, mc_avg_stride,
-                                       avg, avg_stride,
-                                       increase_denoising,
-                                       bs, motion_magnitude, 4);
-  } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
-    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
-                                       mc_avg, mc_avg_stride,
-                                       avg, avg_stride,
-                                       increase_denoising,
-                                       bs, motion_magnitude, 8);
-  } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
-             bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
-             bs == BLOCK_64X32 || bs == BLOCK_64X64) {
+  // Rank by frequency of the block type to have an early termination.
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
     return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
                                      mc_avg, mc_avg_stride,
                                      avg, avg_stride,
                                      increase_denoising,
                                      bs, motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+                                       mc_avg, mc_avg_stride,
+                                       avg, avg_stride,
+                                       increase_denoising,
+                                       bs, motion_magnitude, 8);
   } else {
     return COPY_BLOCK;
   }
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index 0bc417f..cd3e87e 100644
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -56,7 +56,7 @@
 }
 
 /*****************************************************************************
- * This function utilises 3 properties of the cost function lookup tables,   *
+ * This function utilizes 3 properties of the cost function lookup tables,   *
  * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
  * vp9_encoder.c.                                                            *
  * For the joint cost:                                                       *
@@ -122,10 +122,7 @@
 #endif
 
   unsigned int best_sad;
-
-  int i;
-  int j;
-  int step;
+  int i, j, step;
 
   // Check the prerequisite cost function properties that are easy to check
   // in an assert. See the function-level documentation for details on all
@@ -141,11 +138,7 @@
 
   for (i = 0, step = 0; step < tot_steps; step++) {
     for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
-      __m128i v_sad_d;
-      __m128i v_cost_d;
-      __m128i v_outside_d;
-      __m128i v_inside_d;
-      __m128i v_diff_mv_w;
+      __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
 #if ARCH_X86_64
       __m128i v_blocka[2];
 #else
@@ -153,7 +146,7 @@
 #endif
 
       // Compute the candidate motion vectors
-      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
+      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
       const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
       // Clamp them to the search bounds
       __m128i v_these_mv_clamp_w = v_these_mv_w;
@@ -185,8 +178,8 @@
       {
 #if ARCH_X86_64  //  sizeof(intptr_t) == 8
         // Load the offsets
-        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
-        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
+        __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
+        __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
         // Set the ones falling outside to zero
         v_bo10_q = _mm_and_si128(v_bo10_q,
                                  _mm_cvtepi32_epi64(v_inside_d));
@@ -196,7 +189,7 @@
         v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
         v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
 #else  // ARCH_X86 //  sizeof(intptr_t) == 4
-        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
+        __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
         v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
         v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
 #endif
@@ -224,13 +217,10 @@
         const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
 
         __m128i v_cost_10_d, v_cost_32_d;
-
         v_cost_10_d = _mm_cvtsi32_si128(cost0);
         v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
-
         v_cost_32_d = _mm_cvtsi32_si128(cost2);
         v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
-
         v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
       }
 
@@ -246,9 +236,10 @@
 
       // Multiply by sad_per_bit
       v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
-      // ROUND_POWER_OF_TWO(v_cost_d, 8)
-      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
-      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
+      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+      v_cost_d = _mm_add_epi32(v_cost_d,
+                               _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
+      v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
       // Add the cost to the sad
       v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
 
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 2559586..23325d6 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -18,9 +18,9 @@
 extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
                                          YV12_BUFFER_CONFIG *dst);
 
-void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             int w, int h) {
+static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    int w, int h) {
   const __m128i mask = _mm_set1_epi16(0x00FF);
   const int max_width = w & ~15;
   int y;
@@ -87,9 +87,9 @@
   }
 }
 
-void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           int dst_w, int dst_h) {
+static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  int dst_w, int dst_h) {
   dst_w /= 2;
   dst_h /= 2;
   {
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d13e699..51c6fbb 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -40,6 +40,7 @@
   unsigned int                rc_max_inter_bitrate_pct;
   unsigned int                gf_cbr_boost_pct;
   unsigned int                lossless;
+  unsigned int                target_level;
   unsigned int                frame_parallel_decoding_mode;
   AQ_MODE                     aq_mode;
   unsigned int                frame_periodic_boost;
@@ -69,6 +70,7 @@
   0,                          // rc_max_inter_bitrate_pct
   0,                          // gf_cbr_boost_pct
   0,                          // lossless
+  255,                        // target_level
   1,                          // frame_parallel_decoding_mode
   NO_AQ,                      // aq_mode
   0,                          // frame_periodic_delta_q
@@ -183,6 +185,17 @@
   RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
   RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
 
+  {
+    unsigned int level = extra_cfg->target_level;
+    if (level != LEVEL_1 && level != LEVEL_1_1 && level != LEVEL_2 &&
+        level != LEVEL_2_1 && level != LEVEL_3 && level != LEVEL_3_1 &&
+        level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 &&
+        level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 &&
+        level != LEVEL_6_1 && level != LEVEL_6_2 &&
+        level != LEVEL_UNKNOWN && level != LEVEL_MAX)
+    ERROR("target_level is invalid");
+  }
+
   if (cfg->ss_number_layers * cfg->ts_number_layers > VPX_MAX_LAYERS)
     ERROR("ss_number_layers * ts_number_layers is out of range");
   if (cfg->ts_number_layers > 1) {
@@ -496,6 +509,8 @@
   oxcf->temporal_layering_mode = (enum vp9e_temporal_layering_mode)
       cfg->temporal_layering_mode;
 
+  oxcf->target_level = extra_cfg->target_level;
+
   for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
 #if CONFIG_SPATIAL_SVC
     oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
@@ -522,6 +537,7 @@
   /*
   printf("Current VP9 Settings: \n");
   printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+  printf("target_level: %d\n", oxcf->target_level);
   printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
   printf("sharpness: %d\n",    oxcf->sharpness);
   printf("cpu_used: %d\n",  oxcf->cpu_used);
@@ -771,6 +787,20 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.target_level = CAST(VP9E_SET_TARGET_LEVEL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = (int)vp9_get_level(&ctx->cpi->level_info.level_spec);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
                                     vpx_codec_priv_enc_mr_cfg_t *data) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -862,6 +892,11 @@
       break;
   }
 
+  if (deadline == VPX_DL_REALTIME) {
+    ctx->oxcf.pass = 0;
+    new_mode = REALTIME;
+  }
+
   if (ctx->oxcf.mode != new_mode) {
     ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
@@ -928,9 +963,6 @@
   return index_sz;
 }
 
-// vp9 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000LL
-
 static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
                                        int64_t n) {
   return n * TICKS_PER_SEC * timebase->num / timebase->den;
@@ -938,7 +970,7 @@
 
 static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
                                        int64_t n) {
-  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1;
   return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
 
@@ -1503,6 +1535,7 @@
   {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
   {VP9E_SET_SVC_REF_FRAME_CONFIG,     ctrl_set_svc_ref_frame_config},
   {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
+  {VP9E_SET_TARGET_LEVEL,             ctrl_set_target_level},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
@@ -1510,6 +1543,7 @@
   {VP9_GET_REFERENCE,                 ctrl_get_reference},
   {VP9E_GET_SVC_LAYER_ID,             ctrl_get_svc_layer_id},
   {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+  {VP9E_GET_LEVEL,                    ctrl_get_level},
 
   { -1, NULL},
 };
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 2930c23..5f3de8f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,7 +102,7 @@
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
@@ -117,7 +117,7 @@
 endif
 endif
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
diff --git a/vpx/exports_enc b/vpx/exports_enc
index e4707ba..914e36c 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -7,9 +7,3 @@
 text vpx_codec_get_global_headers
 text vpx_codec_get_preview_frame
 text vpx_codec_set_cx_data_buf
-text vpx_svc_dump_statistics
-text vpx_svc_encode
-text vpx_svc_get_message
-text vpx_svc_init
-text vpx_svc_release
-text vpx_svc_set_options
diff --git a/vpx/exports_spatial_svc b/vpx/exports_spatial_svc
new file mode 100644
index 0000000..d258a1d
--- /dev/null
+++ b/vpx/exports_spatial_svc
@@ -0,0 +1,6 @@
+text vpx_svc_dump_statistics
+text vpx_svc_encode
+text vpx_svc_get_message
+text vpx_svc_init
+text vpx_svc_release
+text vpx_svc_set_options
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 710fb51..dcc35c9 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -570,6 +570,21 @@
    */
   VP9E_SET_RENDER_SIZE,
 
+  /*!\brief Codec control function to set target level.
+   *
+   * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
+   * 11: target for level 1.1; ... 62: target for level 6.2
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TARGET_LEVEL,
+
+  /*!\brief Codec control function to get bitstream level.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LEVEL,
+
   /*!\brief Codec control function to set intended superblock size.
    *
    * By default, the superblock size is determined separately for each
@@ -847,6 +862,12 @@
 
 VPX_CTRL_USE_TYPE(VP10E_SET_SUPERBLOCK_SIZE, unsigned int)
 #define VPX_CTRL_VP10E_SET_SUPERBLOCK_SIZE
+
+VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_TARGET_LEVEL
+
+VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LEVEL
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx_dsp/add_noise.c b/vpx_dsp/add_noise.c
new file mode 100644
index 0000000..682b444
--- /dev/null
+++ b/vpx_dsp/add_noise.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_plane_add_noise_c(uint8_t *start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int width, unsigned int height, int pitch) {
+  unsigned int i, j;
+
+  for (i = 0; i < height; i++) {
+    uint8_t *pos = start + i * pitch;
+    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
+
+    for (j = 0; j < width; j++) {
+      int v = pos[j];
+
+      v = clamp(v - blackclamp[0], 0, 255);
+      v = clamp(v + bothclamp[0], 0, 255);
+      v = clamp(v - whiteclamp[0], 0, 255);
+
+      pos[j] = v + ref[j];
+    }
+  }
+}
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index d054c41..e52958c 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -197,3 +197,60 @@
     return s - ((t * t) >> shift_factor);
   }
 }
+
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
+                         const uint8_t *b, int b_stride,
+                         int *min, int *max) {
+  // Load and concatenate.
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
+                                     vld1_u8(a + a_stride));
+  const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
+                                     vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
+                                     vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
+                                     vld1_u8(a + 7 * a_stride));
+
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
+                                     vld1_u8(b + b_stride));
+  const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
+                                     vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
+                                     vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
+                                     vld1_u8(b + 7 * b_stride));
+
+  // Absolute difference.
+  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+  // Max values between the Q vectors.
+  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+  // Split to D and start doing pairwise.
+  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propogate the max/min values to every position.
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u8((uint8_t *)max, ab_max, 0);
+  vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000..21e3e3d
--- /dev/null
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
+                                 int16x8_t *a2, int16x8_t *a3,
+                                 int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  const int16x8_t b0 = vaddq_s16(*a0, *a1);
+  const int16x8_t b1 = vsubq_s16(*a0, *a1);
+  const int16x8_t b2 = vaddq_s16(*a2, *a3);
+  const int16x8_t b3 = vsubq_s16(*a2, *a3);
+  const int16x8_t b4 = vaddq_s16(*a4, *a5);
+  const int16x8_t b5 = vsubq_s16(*a4, *a5);
+  const int16x8_t b6 = vaddq_s16(*a6, *a7);
+  const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);
+  const int16x8_t c4 = vaddq_s16(b4, b6);
+  const int16x8_t c5 = vaddq_s16(b5, b7);
+  const int16x8_t c6 = vsubq_s16(b4, b6);
+  const int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a1 = vsubq_s16(c2, c6);
+  *a2 = vsubq_s16(c0, c4);
+  *a3 = vaddq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+  *a6 = vsubq_s16(c1, c5);
+  *a7 = vaddq_s16(c1, c5);
+}
+
+// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
+// reversing transpose order which may make it easier for the compiler to
+// reconcile the vtrn.64 moves.
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
+                         int16x8_t *a2, int16x8_t *a3,
+                         int16x8_t *a4, int16x8_t *a5,
+                         int16x8_t *a6, int16x8_t *a7) {
+  // Swap 64 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 08 09 10 11 12 13 14 15
+  // a2: 16 17 18 19 20 21 22 23
+  // a3: 24 25 26 27 28 29 30 31
+  // a4: 32 33 34 35 36 37 38 39
+  // a5: 40 41 42 43 44 45 46 47
+  // a6: 48 49 50 51 52 53 54 55
+  // a7: 56 57 58 59 60 61 62 63
+  // to:
+  // a04_lo: 00 01 02 03 32 33 34 35
+  // a15_lo: 08 09 10 11 40 41 42 43
+  // a26_lo: 16 17 18 19 48 49 50 51
+  // a37_lo: 24 25 26 27 56 57 58 59
+  // a04_hi: 04 05 06 07 36 37 38 39
+  // a15_hi: 12 13 14 15 44 45 46 47
+  // a26_hi: 20 21 22 23 52 53 54 55
+  // a37_hi: 28 29 30 31 60 61 62 63
+  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
+  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
+  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
+  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
+  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
+  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
+  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
+  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
+
+  // Swap 32 bit elements resulting in:
+  // a0246_lo:
+  // 00 01 16 17 32 33 48 49
+  // 02 03 18 19 34 35 50 51
+  // a1357_lo:
+  // 08 09 24 25 40 41 56 57
+  // 10 11 26 27 42 43 58 59
+  // a0246_hi:
+  // 04 05 20 21 36 37 52 53
+  // 06 07 22 23 38 39 54 55
+  // a1657_hi:
+  // 12 13 28 29 44 45 60 61
+  // 14 15 30 31 46 47 62 63
+  const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
+                                         vreinterpretq_s32_s16(a26_lo));
+  const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
+                                         vreinterpretq_s32_s16(a37_lo));
+  const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
+                                         vreinterpretq_s32_s16(a26_hi));
+  const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
+                                         vreinterpretq_s32_s16(a37_hi));
+
+  // Swap 16 bit elements resulting in:
+  // b0:
+  // 00 08 16 24 32 40 48 56
+  // 01 09 17 25 33 41 49 57
+  // b1:
+  // 02 10 18 26 34 42 50 58
+  // 03 11 19 27 35 43 51 59
+  // b2:
+  // 04 12 20 28 36 44 52 60
+  // 05 13 21 29 37 45 53 61
+  // b3:
+  // 06 14 22 30 38 46 54 62
+  // 07 15 23 31 39 47 55 63
+  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
+  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
+  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
+  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[1]));
+
+  *a0 = b0.val[0];
+  *a1 = b0.val[1];
+  *a2 = b1.val[0];
+  *a3 = b1.val[1];
+  *a4 = b2.val[0];
+  *a5 = b2.val[1];
+  *a6 = b3.val[0];
+  *a7 = b3.val[1];
+}
+
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+                           int16_t *coeff) {
+  int16x8_t a0 = vld1q_s16(src_diff);
+  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  // Skip the second transpose because it is not required.
+
+  vst1q_s16(coeff + 0, a0);
+  vst1q_s16(coeff + 8, a1);
+  vst1q_s16(coeff + 16, a2);
+  vst1q_s16(coeff + 24, a3);
+  vst1q_s16(coeff + 32, a4);
+  vst1q_s16(coeff + 40, a5);
+  vst1q_s16(coeff + 48, a6);
+  vst1q_s16(coeff + 56, a7);
+}
+
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int i;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = vld1q_s16(coeff + 0);
+    const int16x8_t a1 = vld1q_s16(coeff + 64);
+    const int16x8_t a2 = vld1q_s16(coeff + 128);
+    const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    vst1q_s16(coeff + 0, c0);
+    vst1q_s16(coeff + 64, c1);
+    vst1q_s16(coeff + 128, c2);
+    vst1q_s16(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 2e9dfd4..cf7fd36 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -62,6 +62,8 @@
   coeff[5] = c3 - c7;
 }
 
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped.
 void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
                         int16_t *coeff) {
   int idx;
diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c
index 6ad806a..8140e78 100644
--- a/vpx_dsp/bitreader.c
+++ b/vpx_dsp/bitreader.c
@@ -69,7 +69,7 @@
       buffer += (bits >> 3);
       value = r->value | (nv << (shift & 0x7));
   } else {
-    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
     int loop_end = 0;
     if (bits_over >= 0) {
       count += LOTS_OF_BITS;
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 533f762..707cb92 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -35,10 +35,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, 8);
-    op[1] = WRAPLOW(b1, 8);
-    op[2] = WRAPLOW(c1, 8);
-    op[3] = WRAPLOW(d1, 8);
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
     ip += 4;
     op += 4;
   }
@@ -56,10 +56,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
 
     ip++;
     dest++;
@@ -76,8 +76,8 @@
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, 8);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -98,18 +98,18 @@
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(step[1] + step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[0] - step[3], 8);
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
 }
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -141,8 +141,8 @@
                          int dest_stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -164,48 +164,48 @@
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   temp1 = (step1[0] + step1[2]) * cospi_16_64;
   temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(step1[1] + step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(step1[3] + step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(step1[2] - step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
 }
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -236,8 +236,8 @@
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -266,7 +266,7 @@
   s4 = sinpi_1_9 * x2;
   s5 = sinpi_2_9 * x3;
   s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
+  s7 = WRAPLOW(x0 - x2 + x3);
 
   s0 = s0 + s3 + s5;
   s1 = s1 - s4 - s6;
@@ -277,10 +277,10 @@
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
@@ -311,14 +311,14 @@
   s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
   s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
 
   // stage 2
   s0 = (int)x0;
@@ -330,14 +330,14 @@
   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
 
-  x0 = WRAPLOW(s0 + s2, 8);
-  x1 = WRAPLOW(s1 + s3, 8);
-  x2 = WRAPLOW(s0 - s2, 8);
-  x3 = WRAPLOW(s1 - s3, 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 
   // stage 3
   s2 = (int)(cospi_16_64 * (x2 + x3));
@@ -345,19 +345,19 @@
   s6 = (int)(cospi_16_64 * (x6 + x7));
   s7 = (int)(cospi_16_64 * (x6 - x7));
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
 
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x4, 8);
-  output[2] = WRAPLOW(x6, 8);
-  output[3] = WRAPLOW(-x2, 8);
-  output[4] = WRAPLOW(x3, 8);
-  output[5] = WRAPLOW(-x7, 8);
-  output[6] = WRAPLOW(x5, 8);
-  output[7] = WRAPLOW(-x1, 8);
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
 }
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -420,23 +420,23 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 3
   step1[0] = step2[0];
@@ -446,109 +446,109 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(step2[1] + step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(step2[3] + step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(step2[5] + step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(step2[7] + step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(step2[6] - step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(step2[4] - step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(step2[2] - step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
 }
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -625,22 +625,22 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
 
   // stage 2
   s0 = x0;
@@ -660,22 +660,22 @@
   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, 8);
-  x1 = WRAPLOW(s1 + s5, 8);
-  x2 = WRAPLOW(s2 + s6, 8);
-  x3 = WRAPLOW(s3 + s7, 8);
-  x4 = WRAPLOW(s0 - s4, 8);
-  x5 = WRAPLOW(s1 - s5, 8);
-  x6 = WRAPLOW(s2 - s6, 8);
-  x7 = WRAPLOW(s3 - s7, 8);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
 
   // stage 3
   s0 = x0;
@@ -695,22 +695,22 @@
   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(check_range(s0 + s2), 8);
-  x1 = WRAPLOW(check_range(s1 + s3), 8);
-  x2 = WRAPLOW(check_range(s0 - s2), 8);
-  x3 = WRAPLOW(check_range(s1 - s3), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-  x8 = WRAPLOW(check_range(s8 + s10), 8);
-  x9 = WRAPLOW(check_range(s9 + s11), 8);
-  x10 = WRAPLOW(check_range(s8 - s10), 8);
-  x11 = WRAPLOW(check_range(s9 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -722,31 +722,31 @@
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
 
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x8, 8);
-  output[2] = WRAPLOW(x12, 8);
-  output[3] = WRAPLOW(-x4, 8);
-  output[4] = WRAPLOW(x6, 8);
-  output[5] = WRAPLOW(x14, 8);
-  output[6] = WRAPLOW(x10, 8);
-  output[7] = WRAPLOW(x2, 8);
-  output[8] = WRAPLOW(x3, 8);
-  output[9] = WRAPLOW(x11, 8);
-  output[10] = WRAPLOW(x15, 8);
-  output[11] = WRAPLOW(x7, 8);
-  output[12] = WRAPLOW(x5, 8);
-  output[13] = WRAPLOW(-x13, 8);
-  output[14] = WRAPLOW(x9, 8);
-  output[15] = WRAPLOW(-x1, 8);
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
 }
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -779,8 +779,8 @@
 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -813,43 +813,43 @@
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   step2[0] = step1[0];
@@ -863,40 +863,40 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
-  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
-  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
-  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
-  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
-  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
-  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
-  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
-  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
-  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
-  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
-  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
-  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
 
   // stage 3
   step1[0] = step2[0];
@@ -906,42 +906,42 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -950,87 +950,87 @@
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
-  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
-  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
-  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
-  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
 
-  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
-  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
-  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
-  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
-  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -1039,62 +1039,62 @@
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
-  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
-  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
-  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
-  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
-  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
-  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
 
-  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
-  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
-  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
-  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
-  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
-  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
-  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
-  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
-  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
-  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
-  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
-  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
-  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
-  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
-  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
-  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
-  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -1102,58 +1102,58 @@
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], 8);
-  output[1] = WRAPLOW(step1[1] + step1[30], 8);
-  output[2] = WRAPLOW(step1[2] + step1[29], 8);
-  output[3] = WRAPLOW(step1[3] + step1[28], 8);
-  output[4] = WRAPLOW(step1[4] + step1[27], 8);
-  output[5] = WRAPLOW(step1[5] + step1[26], 8);
-  output[6] = WRAPLOW(step1[6] + step1[25], 8);
-  output[7] = WRAPLOW(step1[7] + step1[24], 8);
-  output[8] = WRAPLOW(step1[8] + step1[23], 8);
-  output[9] = WRAPLOW(step1[9] + step1[22], 8);
-  output[10] = WRAPLOW(step1[10] + step1[21], 8);
-  output[11] = WRAPLOW(step1[11] + step1[20], 8);
-  output[12] = WRAPLOW(step1[12] + step1[19], 8);
-  output[13] = WRAPLOW(step1[13] + step1[18], 8);
-  output[14] = WRAPLOW(step1[14] + step1[17], 8);
-  output[15] = WRAPLOW(step1[15] + step1[16], 8);
-  output[16] = WRAPLOW(step1[15] - step1[16], 8);
-  output[17] = WRAPLOW(step1[14] - step1[17], 8);
-  output[18] = WRAPLOW(step1[13] - step1[18], 8);
-  output[19] = WRAPLOW(step1[12] - step1[19], 8);
-  output[20] = WRAPLOW(step1[11] - step1[20], 8);
-  output[21] = WRAPLOW(step1[10] - step1[21], 8);
-  output[22] = WRAPLOW(step1[9] - step1[22], 8);
-  output[23] = WRAPLOW(step1[8] - step1[23], 8);
-  output[24] = WRAPLOW(step1[7] - step1[24], 8);
-  output[25] = WRAPLOW(step1[6] - step1[25], 8);
-  output[26] = WRAPLOW(step1[5] - step1[26], 8);
-  output[27] = WRAPLOW(step1[4] - step1[27], 8);
-  output[28] = WRAPLOW(step1[3] - step1[28], 8);
-  output[29] = WRAPLOW(step1[2] - step1[29], 8);
-  output[30] = WRAPLOW(step1[1] - step1[30], 8);
-  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
 }
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1253,8 +1253,8 @@
   int i, j;
   tran_high_t a1;
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
@@ -1288,10 +1288,10 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, bd);
-    op[1] = WRAPLOW(b1, bd);
-    op[2] = WRAPLOW(c1, bd);
-    op[3] = WRAPLOW(d1, bd);
+    op[0] = HIGHBD_WRAPLOW(a1, bd);
+    op[1] = HIGHBD_WRAPLOW(b1, bd);
+    op[2] = HIGHBD_WRAPLOW(c1, bd);
+    op[3] = HIGHBD_WRAPLOW(d1, bd);
     ip += 4;
     op += 4;
   }
@@ -1309,10 +1309,14 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
-    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
-    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
-    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
+                                             HIGHBD_WRAPLOW(a1, bd), bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
+                                             HIGHBD_WRAPLOW(b1, bd), bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
+                                             HIGHBD_WRAPLOW(c1, bd), bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
+                                             HIGHBD_WRAPLOW(d1, bd), bd);
 
     ip++;
     dest++;
@@ -1332,8 +1336,8 @@
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+  op[0] = HIGHBD_WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -1359,18 +1363,18 @@
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(step[1] + step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[0] - step[3], bd);
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
 }
 
 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1404,11 +1408,11 @@
                                 int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1430,39 +1434,39 @@
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
   vpx_highbd_idct4_c(step1, step1, bd);
 
   // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   // stage 3 - odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
 }
 
 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1496,10 +1500,10 @@
                                 int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -1529,7 +1533,7 @@
   s4 = sinpi_1_9 * x2;
   s5 = sinpi_2_9 * x3;
   s6 = sinpi_4_9 * x3;
-  s7 = (tran_high_t)(x0 - x2 + x3);
+  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
 
   s0 = s0 + s3 + s5;
   s1 = s1 - s4 - s6;
@@ -1540,10 +1544,10 @@
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
-  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
-  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1574,14 +1578,14 @@
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1593,14 +1597,14 @@
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1608,19 +1612,19 @@
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
 
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x4, bd);
-  output[2] = WRAPLOW(x6, bd);
-  output[3] = WRAPLOW(-x2, bd);
-  output[4] = WRAPLOW(x3, bd);
-  output[5] = WRAPLOW(-x7, bd);
-  output[6] = WRAPLOW(x5, bd);
-  output[7] = WRAPLOW(-x1, bd);
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x4, bd);
+  output[2] = HIGHBD_WRAPLOW(x6, bd);
+  output[3] = HIGHBD_WRAPLOW(-x2, bd);
+  output[4] = HIGHBD_WRAPLOW(x3, bd);
+  output[5] = HIGHBD_WRAPLOW(-x7, bd);
+  output[6] = HIGHBD_WRAPLOW(x5, bd);
+  output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1685,23 +1689,23 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1711,109 +1715,109 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 }
 
 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1889,22 +1893,22 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
-  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
-  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
+  x8  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+  x9  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -1924,22 +1928,22 @@
   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, bd);
-  x1 = WRAPLOW(s1 + s5, bd);
-  x2 = WRAPLOW(s2 + s6, bd);
-  x3 = WRAPLOW(s3 + s7, bd);
-  x4 = WRAPLOW(s0 - s4, bd);
-  x5 = WRAPLOW(s1 - s5, bd);
-  x6 = WRAPLOW(s2 - s6, bd);
-  x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
-  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -1959,22 +1963,22 @@
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-  x8 = WRAPLOW(s8 + s10, bd);
-  x9 = WRAPLOW(s9 + s11, bd);
-  x10 = WRAPLOW(s8 - s10, bd);
-  x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -1986,31 +1990,31 @@
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
 
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x8, bd);
-  output[2] = WRAPLOW(x12, bd);
-  output[3] = WRAPLOW(-x4, bd);
-  output[4] = WRAPLOW(x6, bd);
-  output[5] = WRAPLOW(x14, bd);
-  output[6] = WRAPLOW(x10, bd);
-  output[7] = WRAPLOW(x2, bd);
-  output[8] = WRAPLOW(x3, bd);
-  output[9] = WRAPLOW(x11, bd);
-  output[10] = WRAPLOW(x15, bd);
-  output[11] = WRAPLOW(x7, bd);
-  output[12] = WRAPLOW(x5, bd);
-  output[13] = WRAPLOW(-x13, bd);
-  output[14] = WRAPLOW(x9, bd);
-  output[15] = WRAPLOW(-x1, bd);
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x8, bd);
+  output[2] = HIGHBD_WRAPLOW(x12, bd);
+  output[3] = HIGHBD_WRAPLOW(-x4, bd);
+  output[4] = HIGHBD_WRAPLOW(x6, bd);
+  output[5] = HIGHBD_WRAPLOW(x14, bd);
+  output[6] = HIGHBD_WRAPLOW(x10, bd);
+  output[7] = HIGHBD_WRAPLOW(x2, bd);
+  output[8] = HIGHBD_WRAPLOW(x3, bd);
+  output[9] = HIGHBD_WRAPLOW(x11, bd);
+  output[10] = HIGHBD_WRAPLOW(x15, bd);
+  output[11] = HIGHBD_WRAPLOW(x7, bd);
+  output[12] = HIGHBD_WRAPLOW(x5, bd);
+  output[13] = HIGHBD_WRAPLOW(-x13, bd);
+  output[14] = HIGHBD_WRAPLOW(x9, bd);
+  output[15] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2045,11 +2049,11 @@
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -2084,43 +2088,43 @@
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2134,40 +2138,40 @@
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
-  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -2177,42 +2181,42 @@
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2221,87 +2225,87 @@
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
 
-  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2310,62 +2314,62 @@
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
 
-  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -2373,58 +2377,58 @@
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
 }
 
 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2500,9 +2504,9 @@
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index adbb838..6397e66 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -21,7 +21,7 @@
 extern "C" {
 #endif
 
-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
@@ -32,17 +32,17 @@
   assert(INT16_MIN <= input);
   assert(input <= INT16_MAX);
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
+  return (tran_high_t)rv;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input,
+                                             int bd) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid highbitdepth VP9 streams, intermediate stage coefficients will
   // stay within the ranges:
@@ -56,13 +56,12 @@
   (void) int_min;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   (void) bd;
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
+  return (tran_high_t)rv;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -83,9 +82,20 @@
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+    ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#else   // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+    ((int32_t)highbd_check_range((x), bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void idct4_c(const tran_low_t *input, tran_low_t *output);
@@ -108,14 +118,14 @@
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                              int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+  trans = HIGHBD_WRAPLOW(trans, bd);
+  return clip_pixel_highbd(dest + (int)trans, bd);
 }
 #endif
 
 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
+  trans = WRAPLOW(trans);
+  return clip_pixel(dest + (int)trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx_dsp/mips/add_noise_msa.c b/vpx_dsp/mips/add_noise_msa.c
new file mode 100644
index 0000000..366770c
--- /dev/null
+++ b/vpx_dsp/mips/add_noise_msa.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+void vpx_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], uint32_t width,
+                             uint32_t height, int32_t pitch) {
+  uint32_t i, j;
+
+  for (i = 0; i < height / 2; ++i) {
+    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+    for (j = width / 16; j--;) {
+      v16i8 temp00_s, temp01_s;
+      v16u8 temp00, temp01, black_clamp, white_clamp;
+      v16u8 pos0, ref0, pos1, ref1;
+      v16i8 const127 = __msa_ldi_b(127);
+
+      pos0 = LD_UB(pos0_ptr);
+      ref0 = LD_UB(ref0_ptr);
+      pos1 = LD_UB(pos1_ptr);
+      ref1 = LD_UB(ref1_ptr);
+      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+      temp00 = (pos0 < black_clamp);
+      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+      temp01 = (pos1 < black_clamp);
+      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp00 = (v16u8)(temp00_s < pos0);
+      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp01 = (temp01_s < pos1);
+      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      pos0 += ref0;
+      ST_UB(pos0, pos0_ptr);
+      pos1 += ref1;
+      ST_UB(pos1, pos1_ptr);
+      pos0_ptr += 16;
+      pos1_ptr += 16;
+      ref0_ptr += 16;
+      ref1_ptr += 16;
+    }
+  }
+}
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 6426ccc..80fcd66 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -53,7 +53,7 @@
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    const int abs_qcoeff = (int)((tmp * quant) >> 16);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
     if (abs_qcoeff)
@@ -109,7 +109,7 @@
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    const int abs_qcoeff = (int)((tmp * quant) >> 15);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
     if (abs_qcoeff)
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index e64dae3..e49148d 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -33,47 +33,6 @@
   return sad;
 }
 
-// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
-/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
- * The function averages every corresponding element of the buffers and stores
- * the value in a third buffer, comp_pred.
- * pred and comp_pred are assumed to have stride = width
- * In the usage below comp_pred is a local array.
- */
-static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                            int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                                   int width, int height, const uint8_t *ref8,
-                                   int ref_stride) {
-  int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 #define sadMxN(m, n) \
 unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                   const uint8_t *ref, int ref_stride) { \
@@ -83,7 +42,7 @@
                                       const uint8_t *ref, int ref_stride, \
                                       const uint8_t *second_pred) { \
   uint8_t comp_pred[m * n]; \
-  avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
   return sad(src, src_stride, comp_pred, m, m, n); \
 }
 
@@ -237,7 +196,7 @@
                                              int ref_stride, \
                                              const uint8_t *second_pred) { \
   uint16_t comp_pred[m * n]; \
-  highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
   return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
 }
 
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index af2bba2..3fd80dc 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -304,7 +304,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int  a_stride,
                               const uint8_t *b8, int  b_stride,
-                              int w, int h, uint64_t *sse, uint64_t *sum) {
+                              int w, int h, uint64_t *sse, int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -327,7 +327,7 @@
                               const uint8_t *b8, int  b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)sse_long;
   *sum = (int)sum_long;
@@ -337,7 +337,7 @@
                                const uint8_t *b8, int  b_stride,
                                int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
@@ -347,7 +347,7 @@
                                const uint8_t *b8, int  b_stride,
                                int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
@@ -370,8 +370,10 @@
                                              int b_stride, \
                                              uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 } \
 \
 uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
@@ -380,8 +382,10 @@
                                              int b_stride, \
                                              uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 }
 
 #define HIGHBD_GET_VAR(S) \
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index d55d952..06b46d3 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -43,7 +43,6 @@
 DSP_SRCS-yes += intrapred.c
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
@@ -57,6 +56,12 @@
 endif  # CONFIG_USE_X86INC
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
+ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes += add_noise.c
+DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
+DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
+endif # CONFIG_POSTPROC
+
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
@@ -275,6 +280,7 @@
 DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
 DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
 DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
@@ -306,7 +312,6 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
-DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
 DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
 DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
@@ -354,8 +359,6 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
-DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
-DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 7aaa89f..82bafb5 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_COMMON_H_
-#define VPX_DSP_COMMON_H_
+#ifndef VPX_DSP_VPX_DSP_COMMON_H_
+#define VPX_DSP_VPX_DSP_COMMON_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -90,4 +90,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_COMMON_H_
+#endif  // VPX_DSP_VPX_DSP_COMMON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 7526bea..a04a684 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -68,13 +68,13 @@
 #
 
 add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_4x4/;
 
 add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_4x4/;
@@ -131,7 +131,7 @@
 specialize qw/vpx_d207e_predictor_8x8/;
 
 add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_8x8/;
@@ -548,7 +548,7 @@
 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
@@ -569,7 +569,7 @@
 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
@@ -998,6 +998,42 @@
   specialize qw/vpx_sum_squares_i16 sse2/;
 }
 
+add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x16 media neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x8 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x16 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x8 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x4 neon msa/, "$sse2_x86inc";
+
+#
+# Avg
+#
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
   #
   # Avg
@@ -1019,17 +1055,17 @@
   # Minmax
   #
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vpx_minmax_8x8 sse2/;
+  specialize qw/vpx_minmax_8x8 sse2 neon/;
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
     specialize qw/vpx_highbd_minmax_8x8/;
   }
 
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
+  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64_x86inc";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_16x16 sse2/;
+  specialize qw/vpx_hadamard_16x16 sse2 neon/;
 
   add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
   specialize qw/vpx_satd sse2 neon/;
@@ -1062,13 +1098,13 @@
 specialize qw/vpx_sad32x32      avx2       neon msa/, "$sse2_x86inc";
 specialize qw/vpx_sad32x16      avx2            msa/, "$sse2_x86inc";
 specialize qw/vpx_sad16x32                      msa/, "$sse2_x86inc";
-specialize qw/vpx_sad16x16   mmx     media neon msa/, "$sse2_x86inc";
-specialize qw/vpx_sad16x8    mmx           neon msa/, "$sse2_x86inc";
-specialize qw/vpx_sad8x16    mmx           neon msa/, "$sse2_x86inc";
-specialize qw/vpx_sad8x8     mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16           media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8                  neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16                  neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8                   neon msa/, "$sse2_x86inc";
 specialize qw/vpx_sad8x4                        msa/, "$sse2_x86inc";
 specialize qw/vpx_sad4x8                        msa/, "$sse2_x86inc";
-specialize qw/vpx_sad4x4     mmx           neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4                   neon msa/, "$sse2_x86inc";
 
 specialize qw/vpx_sad128x128_avg         /, "$sse2_x86inc";
 specialize qw/vpx_sad128x64_avg          /, "$sse2_x86inc";
@@ -1248,23 +1284,67 @@
 if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 
 #
+# Variance
+#
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x64 sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x32 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
+
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x8 sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x16 sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x8 sse2 media neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x4 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x4 sse2 msa/;
+
+#
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
-specialize qw/vpx_get16x16var     avx2 sse2 neon msa/;
-specialize qw/vpx_get8x8var   mmx      sse2 neon msa/;
+specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+specialize qw/vpx_get8x8var   sse2      neon msa/;
+
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
 
-specialize qw/vpx_mse16x16 mmx avx2 sse2 media neon msa/;
-specialize qw/vpx_mse16x8           sse2            msa/;
-specialize qw/vpx_mse8x16           sse2            msa/;
-specialize qw/vpx_mse8x8            sse2            msa/;
+specialize qw/vpx_mse16x16          sse2 avx2 media neon msa/;
+specialize qw/vpx_mse16x8           sse2                 msa/;
+specialize qw/vpx_mse8x16           sse2                 msa/;
+specialize qw/vpx_mse8x8            sse2                 msa/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   foreach $bd (8, 10, 12) {
@@ -1302,7 +1382,7 @@
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
 add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
-specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+specialize qw/vpx_get_mb_ss sse2 msa/;
 specialize qw/vpx_get4x4sse_cs neon msa/;
 
 #
@@ -1321,13 +1401,13 @@
 specialize qw/vpx_variance32x32     sse2 avx2       neon msa/;
 specialize qw/vpx_variance32x16     sse2 avx2            msa/;
 specialize qw/vpx_variance16x32     sse2                 msa/;
-specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
-specialize qw/vpx_variance16x8  mmx sse2            neon msa/;
-specialize qw/vpx_variance8x16  mmx sse2            neon msa/;
-specialize qw/vpx_variance8x8   mmx sse2      media neon msa/;
+specialize qw/vpx_variance16x16     sse2 avx2 media neon msa/;
+specialize qw/vpx_variance16x8      sse2            neon msa/;
+specialize qw/vpx_variance8x16      sse2            neon msa/;
+specialize qw/vpx_variance8x8       sse2      media neon msa/;
 specialize qw/vpx_variance8x4       sse2                 msa/;
 specialize qw/vpx_variance4x8       sse2                 msa/;
-specialize qw/vpx_variance4x4   mmx sse2                 msa/;
+specialize qw/vpx_variance4x4       sse2                 msa/;
 
 specialize qw/vpx_sub_pixel_variance64x64     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_variance64x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
@@ -1335,13 +1415,13 @@
 specialize qw/vpx_sub_pixel_variance32x32     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_variance32x16                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_variance16x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_variance16x16 mmx      media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_variance16x8  mmx                 msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_variance8x16  mmx                 msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_variance8x8   mmx      media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x16          media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x8                      msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x16                      msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x8            media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_variance8x4                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_variance4x8                       msa/, "$sse_x86inc",                  "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_variance4x4   mmx                 msa/, "$sse_x86inc",                  "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x8                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x4                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
 
 specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_avg_variance64x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
@@ -1354,8 +1434,8 @@
 specialize qw/vpx_sub_pixel_avg_variance8x16       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_avg_variance8x8        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
 specialize qw/vpx_sub_pixel_avg_variance8x4        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_avg_variance4x8        msa/, "$sse_x86inc",                 "$ssse3_x86inc";
-specialize qw/vpx_sub_pixel_avg_variance4x4        msa/, "$sse_x86inc",                 "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x8        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x4        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   foreach $bd (8, 10, 12) {
@@ -1432,24 +1512,426 @@
   }
 }
 
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
 #
 # Specialty Subpixel
 #
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/;
 
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/;
 
-add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
-specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/;
 
 #
 # Comp Avg
 #
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x8 sse2/;
+
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+  #
+  # Subpixel Variance
+  #
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Post Processing
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vpx_plane_add_noise sse2 msa/;
 }
 
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpx_dsp/x86/add_noise_sse2.asm b/vpx_dsp/x86/add_noise_sse2.asm
new file mode 100644
index 0000000..ff61b19
--- /dev/null
+++ b/vpx_dsp/x86/add_noise_sse2.asm
@@ -0,0 +1,83 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vpx_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
+;                              unsigned char blackclamp[16],
+;                              unsigned char whiteclamp[16],
+;                              unsigned char bothclamp[16],
+;                              unsigned int width, unsigned int height,
+;                              int pitch)
+global sym(vpx_plane_add_noise_sse2) PRIVATE
+sym(vpx_plane_add_noise_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; get the clamps in registers
+    mov     rdx, arg(2) ; blackclamp
+    movdqu  xmm3, [rdx]
+    mov     rdx, arg(3) ; whiteclamp
+    movdqu  xmm4, [rdx]
+    mov     rdx, arg(4) ; bothclamp
+    movdqu  xmm5, [rdx]
+
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    mov     rdi, rcx
+    movsxd  rcx, dword arg(5) ;[Width]
+    mov     rsi, arg(0) ;Pos
+    xor         rax,rax
+
+.addnoise_nextset:
+      movdqu      xmm1,[rsi+rax]         ; get the source
+
+      psubusb     xmm1, xmm3 ; subtract black clamp
+      paddusb     xmm1, xmm5 ; add both clamp
+      psubusb     xmm1, xmm4 ; subtract whiteclamp
+
+      movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+      paddb       xmm1,xmm2              ; add it in
+      movdqu      [rsi+rax],xmm1         ; store the result
+
+      add         rax,16                 ; move to the next line
+
+      cmp         rax, rcx
+      jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04
+four8s:
+    times 4 dd 8
diff --git a/vpx_dsp/x86/halfpix_variance_sse2.c b/vpx_dsp/x86/halfpix_variance_sse2.c
index 5782155..4a8fb6d 100644
--- a/vpx_dsp/x86/halfpix_variance_sse2.c
+++ b/vpx_dsp/x86/halfpix_variance_sse2.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
@@ -40,7 +42,9 @@
                                     &xsum0, &xxsum0);
 
   *sse = xxsum0;
-  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+  assert(xsum0 <= 255 * 16 * 16);
+  assert(xsum0 >= -255 * 16 * 16);
+  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
 }
 
 uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
@@ -54,7 +58,9 @@
                                    &xsum0, &xxsum0);
 
   *sse = xxsum0;
-  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+  assert(xsum0 <= 255 * 16 * 16);
+  assert(xsum0 >= -255 * 16 * 16);
+  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
 }
 
 
@@ -70,5 +76,7 @@
                                          &xsum0, &xxsum0);
 
   *sse = xxsum0;
-  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+  assert(xsum0 <= 255 * 16 * 16);
+  assert(xsum0 >= -255 * 16 * 16);
+  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
 }
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index e2b79bf..7bfa383 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -259,13 +259,12 @@
                                                  int height, \
                                                  unsigned int *sse, \
                                                  void *unused0, void *unused);
-#define DECLS(opt1, opt2) \
-  DECL(8, opt1); \
-  DECL(16, opt1)
+#define DECLS(opt) \
+  DECL(8, opt); \
+  DECL(16, opt)
 
-DECLS(sse2, sse);
-// TODO(johannkoenig): enable the ssse3 or delete
-// DECLS(ssse3, ssse3);
+DECLS(sse2);
+
 #undef DECLS
 #undef DECL
 
@@ -402,21 +401,21 @@
   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
 
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
+#define FNS(opt) \
+FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt, (int64_t));
 
 
-FNS(sse2, sse);
+FNS(sse2);
 
 #undef FNS
 #undef FN
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index c24d536..cd6a6ae 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -11,6 +11,7 @@
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
+pb_1: times 16 db 1
 pw_4:  times 8 dw 4
 pw_8:  times 8 dw 8
 pw_16: times 8 dw 16
@@ -23,6 +24,115 @@
 
 SECTION .text
 
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb               %4, %1, %3
+  pxor                %3, %1
+  pand                %3, [GLOBAL(pb_1)]
+  psubb               %4, %3
+  pavgb               %4, %2
+%endmacro
+
+INIT_XMM sse2
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movq                 m0, [aboveq]
+  DEFINE_ARGS dst, stride, temp
+  psrldq               m1, m0, 1
+  psrldq               m2, m0, 2
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+
+  ; store 4 lines
+  movd   [dstq          ], m3
+  psrlq                m3, 8
+  movd   [dstq+strideq  ], m3
+  lea                dstq, [dstq+strideq*2]
+  psrlq                m3, 8
+  movd   [dstq          ], m3
+  psrlq                m3, 8
+  movd   [dstq+strideq  ], m3
+  psrlq                m0, 56
+  movd              tempq, m0
+  mov    [dstq+strideq+3], tempb
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movu                m1, [aboveq]
+  pslldq              m0, m1, 1
+  psrldq              m2, m1, 1
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+  punpckhbw           m0, m0 ; 7 7
+  punpcklwd           m0, m0 ; 7 7 7 7
+  punpckldq           m0, m0 ; 7 7 7 7 7 7 7 7
+  punpcklqdq          m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
+
+ ; store 4 lines
+  psrldq                m3, 1
+  movq    [dstq          ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq  ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq*2], m3
+  psrldq                m3, 1
+  movq    [dstq+stride3q ], m3
+  lea                 dstq, [dstq+strideq*4]
+
+  ; store next 4 lines
+  psrldq                m3, 1
+  movq    [dstq          ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq  ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq*2], m3
+  psrldq                m3, 1
+  movq    [dstq+stride3q ], m3
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
+  GET_GOT     goffsetq
+
+  movd                m0, [leftq]                ; abcd [byte]
+  punpcklbw           m4, m0, m0                 ; aabb ccdd
+  punpcklwd           m4, m4                     ; aaaa bbbb cccc dddd
+  psrldq              m4, 12                     ; dddd
+  punpckldq           m0, m4                     ; abcd dddd
+  psrldq              m1, m0, 1                  ; bcdd
+  psrldq              m2, m0, 2                  ; cddd
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3   ; a2bc b2cd c3d d
+  pavgb               m1, m0                     ; ab, bc, cd, d [byte]
+
+  punpcklbw           m1, m3             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+  movd    [dstq        ], m1
+  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
+  movd    [dstq+strideq], m1
+
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m1, 16             ; cd, c3d, d, d
+  movd    [dstq        ], m1
+  movd    [dstq+strideq], m4             ; d, d, d, d
+  RESTORE_GOT
+  RET
+
 INIT_XMM sse2
 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm
index d061278..5e0139f 100644
--- a/vpx_dsp/x86/intrapred_ssse3.asm
+++ b/vpx_dsp/x86/intrapred_ssse3.asm
@@ -13,7 +13,6 @@
 SECTION_RODATA
 
 pb_1: times 16 db 1
-sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
@@ -28,77 +27,9 @@
 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
 SECTION .text
 
-INIT_MMX ssse3
-cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m0, [aboveq]
-  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
-  pshufb              m1, m0, [GLOBAL(sh_b01234577)]
-  pshufb              m0, [GLOBAL(sh_b12345677)]
-  pavgb               m3, m2, m1
-  pxor                m2, m1
-  pand                m2, [GLOBAL(pb_1)]
-  psubb               m3, m2
-  pavgb               m0, m3
-
-  ; store 4 lines
-  movd    [dstq        ], m0
-  psrlq               m0, 8
-  movd    [dstq+strideq], m0
-  lea               dstq, [dstq+strideq*2]
-  psrlq               m0, 8
-  movd    [dstq        ], m0
-  psrlq               m0, 8
-  movd    [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_MMX ssse3
-cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m0, [aboveq]
-  mova                m1, [GLOBAL(sh_b12345677)]
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
-  pavgb               m3, m2, m0
-  pxor                m2, m0
-  pshufb              m0, m1
-  pand                m2, [GLOBAL(pb_1)]
-  psubb               m3, m2
-  pavgb               m0, m3
-
-  ; store 4 lines
-  movq  [dstq          ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq  ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq*2], m0
-  pshufb              m0, m1
-  movq  [dstq+stride3q ], m0
-  pshufb              m0, m1
-  lea               dstq, [dstq+strideq*4]
-
-  ; store next 4 lines
-  movq  [dstq          ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq  ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq*2], m0
-  pshufb              m0, m1
-  movq  [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
 INIT_XMM ssse3
 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
   GET_GOT     goffsetq
@@ -715,28 +646,6 @@
   RESTORE_GOT
   RET
 
-INIT_MMX ssse3
-cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
-  GET_GOT     goffsetq
-  movd                m0, [leftq]                ; abcd [byte]
-  pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
-  pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
-  pavgb               m1, m0             ; ab, bc, cd, d [byte]
-
-  punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
-  movd    [dstq        ], m1
-  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
-  movd    [dstq+strideq], m1
-  lea               dstq, [dstq+strideq*2]
-  psrlq               m1, 16             ; cd, c3d, d, d
-  movd    [dstq        ], m1
-  pshufw              m1, m1, q1111      ; d, d, d, d
-  movd    [dstq+strideq], m1
-  RESTORE_GOT
-  RET
-
 INIT_XMM ssse3
 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
   GET_GOT     goffsetq
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index ae907fd..df5068c 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -158,8 +158,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 4);
 
   dc_value = _mm_set1_epi16(a);
@@ -527,8 +527,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 5);
 
   dc_value = _mm_set1_epi16(a);
@@ -1305,30 +1305,16 @@
   const __m128i zero = _mm_setzero_si128();
   int a, i;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 6);
 
   dc_value = _mm_set1_epi16(a);
 
-  for (i = 0; i < 2; ++i) {
-    RECON_AND_STORE(dest +  0 * stride, dc_value);
-    RECON_AND_STORE(dest +  1 * stride, dc_value);
-    RECON_AND_STORE(dest +  2 * stride, dc_value);
-    RECON_AND_STORE(dest +  3 * stride, dc_value);
-    RECON_AND_STORE(dest +  4 * stride, dc_value);
-    RECON_AND_STORE(dest +  5 * stride, dc_value);
-    RECON_AND_STORE(dest +  6 * stride, dc_value);
-    RECON_AND_STORE(dest +  7 * stride, dc_value);
-    RECON_AND_STORE(dest +  8 * stride, dc_value);
-    RECON_AND_STORE(dest +  9 * stride, dc_value);
-    RECON_AND_STORE(dest + 10 * stride, dc_value);
-    RECON_AND_STORE(dest + 11 * stride, dc_value);
-    RECON_AND_STORE(dest + 12 * stride, dc_value);
-    RECON_AND_STORE(dest + 13 * stride, dc_value);
-    RECON_AND_STORE(dest + 14 * stride, dc_value);
-    RECON_AND_STORE(dest + 15 * stride, dc_value);
-    dest += 8;
+  for (i = 0; i < 16; ++i) {
+    RECON_AND_STORE(dest +  0, dc_value);
+    RECON_AND_STORE(dest +  8, dc_value);
+    dest += stride;
   }
 }
 
@@ -3476,8 +3462,8 @@
   const __m128i zero = _mm_setzero_si128();
   int a, j;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 6);
 
   dc_value = _mm_set1_epi16(a);
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
deleted file mode 100644
index 45d0ecc..0000000
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ /dev/null
@@ -1,436 +0,0 @@
-;
-;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-tfe:
-    times 8 db 0xfe
-t80:
-    times 8 db 0x80
-t3:
-    times 8 db 0x03
-t4:
-    times 8 db 0x04
-ones:
-    times 4 dw 0x0001
-
-SECTION .text
-
-%define stkreg rsp
-
-%define t0                  0
-%define t1            t0 + 16
-%define p1            t1 + 16
-%define p0            p1 + 16
-%define q0            p0 + 16
-%define q1            q0 + 16
-%define lstacksize    q1 + 16
-
-%define goffsetq _limitq
-
-;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int  src_pixel_step,
-;                              const char *blimit, const char *limit,
-;                              const char *thresh);
-INIT_MMX mmx
-cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \
-                                s, p, _blimit, _limit, _thresh, s1
-    movq                  m7, [_limitq]
-    GET_GOT         goffsetq
-%if GET_GOT_DEFINED=1
-    add rsp, gprsize                          ; restore stack
-%endif
-    lea                  s1q, [sq + pq]       ; s1q points to row +1
-
-    ; calculate breakout conditions
-    movq                  m2, [s1q + 2 * pq]  ; q3
-    movq                  m1, [ sq + 2 * pq]  ; q2
-    movq                  m6, m1              ; q2
-    psubusb               m1, m2              ; q2-=q3
-    psubusb               m2, m6              ; q3-=q2
-    por                   m1, m2              ; abs(q3-q2)
-    psubusb               m1, m7
-    movq                  m4, [sq + pq]       ; q1
-    movq                  m3, m4              ; q1
-    psubusb               m4, m6              ; q1-=q2
-    psubusb               m6, m3              ; q2-=q1
-    por                   m4, m6              ; abs(q2-q1)
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m4, [sq]            ; q0
-    movq                  m0, m4              ; q0
-    psubusb               m4, m3              ; q0-=q1
-    psubusb               m3, m0              ; q1-=q0
-    por                   m4, m3              ; abs(q0-q1)
-    movq       [stkreg + t0], m4              ; save to t0
-    psubusb               m4, m7
-    por                   m1, m4
-    neg                   pq                  ; negate pitch to deal with
-                                              ; above border
-    movq                  m2, [ sq + 4 * pq]  ; p3
-    movq                  m4, [s1q + 4 * pq]  ; p2
-    movq                  m5, m4              ; p2
-    psubusb               m4, m2              ; p2-=p3
-    psubusb               m2, m5              ; p3-=p2
-    por                   m4, m2              ; abs(p3 - p2)
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m4, [sq + 2 * pq]   ; p1
-    movq                  m3, m4              ; p1
-    psubusb               m4, m5              ; p1-=p2
-    psubusb               m5, m3              ; p2-=p1
-    por                   m4, m5              ; abs(p2 - p1)
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m2, m3              ; p1
-    movq                  m4, [sq + pq]       ; p0
-    movq                  m5, m4              ; p0
-    psubusb               m4, m3              ; p0-=p1
-    psubusb               m3, m5              ; p1-=p0
-    por                   m4, m3              ; abs(p1 - p0)
-    movq       [stkreg + t1], m4              ; save to t1
-    psubusb               m4, m7
-    por                   m1, m4
-    movq                  m3, [s1q]           ; q1
-    movq                  m4, m3              ; q1
-    psubusb               m3, m2              ; q1-=p1
-    psubusb               m2, m4              ; p1-=q1
-    por                   m2, m3              ; abs(p1-q1)
-    pand                  m2, [GLOBAL(tfe)]   ; set lsb of each byte to zero
-    psrlw                 m2, 1               ; abs(p1-q1)/2
-    movq                  m6, m5              ; p0
-    movq                  m3, [sq]            ; q0
-    psubusb               m5, m3              ; p0-=q0
-    psubusb               m3, m6              ; q0-=p0
-    por                   m5, m3              ; abs(p0 - q0)
-    paddusb               m5, m5              ; abs(p0-q0)*2
-    paddusb               m5, m2              ; abs (p0 - q0) * 2 + abs(p1-q1)/2
-    movq                  m7, [_blimitq]            ; blimit
-    psubusb               m5, m7              ; abs (p0 - q0) * 2 +
-                                              ; abs(p1-q1)/2  > blimit
-    por                   m1, m5
-    pxor                  m5, m5
-    pcmpeqb               m1, m5              ; mask m1
-
-    ; calculate high edge variance
-    movq                  m7, [_threshq]
-    movq                  m4, [stkreg + t0]   ; get abs (q1 - q0)
-    psubusb               m4, m7
-    movq                  m3, [stkreg + t1]   ; get abs (p1 - p0)
-    psubusb               m3, m7
-    paddb                 m4, m3              ; abs(q1 - q0) > thresh ||
-                                              ; abs(p1 - p0) > thresh
-    pcmpeqb               m4, m5
-    pcmpeqb               m5, m5
-    movq                  m3, [GLOBAL(t80)]
-    pxor                  m4, m5
-
-    ; start work on filters
-    movq                  m2, [sq + 2 * pq]   ; p1
-    movq                  m7, [s1q]           ; q1
-    pxor                  m2, m3              ; p1 converted to signed values
-    pxor                  m7, m3              ; q1 converted to signed values
-    psubsb                m2, m7              ; p1 - q1
-    pand                  m2, m4              ; high var mask (hvm)(p1 - q1)
-    pxor                  m6, m3              ; p0 converted to signed values
-    pxor                  m0, m3              ; q0 converted to signed values
-    movq                  m3, m0              ; q0
-    psubsb                m0, m6              ; q0 - p0
-    paddsb                m2, m0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-    pand                  m1, m2              ; mask filter values we don't
-                                              ; care about
-    movq                  m2, m1
-    paddsb                m1, [GLOBAL(t4)]    ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-    paddsb                m2, [GLOBAL(t3)]    ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-    pxor                  m0, m0
-    pxor                  m5, m5
-    punpcklbw             m0, m2
-    punpckhbw             m5, m2
-    psraw                 m0, 11
-    psraw                 m5, 11
-    packsswb              m0, m5
-    movq                  m2, m0              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 3) >> 3;
-    pxor                  m0, m0
-    movq                  m5, m1              ; abcdefgh
-    punpcklbw             m0, m1              ; e0f0g0h0
-    psraw                 m0, 11              ; sign extended shift right by 3
-    pxor                  m1, m1
-    punpckhbw             m1, m5              ; a0b0c0d0
-    psraw                 m1, 11              ; sign extended shift right by 3
-    movq                  m5, m0              ; save results
-
-    packsswb              m0, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>3
-    paddsw                m5, [GLOBAL(ones)]
-    paddsw                m1, [GLOBAL(ones)]
-    psraw                 m5, 1
-    psraw                 m1, 1
-    packsswb              m5, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>4
-    movq                  m1, [GLOBAL(t80)]
-    pandn                 m4, m5              ; high edge variance additive
-    paddsb                m6, m2              ; p0+= p0 add
-    pxor                  m6, m1              ; unoffset
-    movq           [sq + pq], m6              ; write back
-    movq                  m6, [sq + 2 * pq]   ; p1
-    pxor                  m6, m1              ; reoffset
-    paddsb                m6, m4              ; p1+= p1 add
-    pxor                  m6, m1              ; unoffset
-    movq       [sq + 2 * pq], m6              ; write back
-    psubsb                m3, m0              ; q0-= q0 add
-    pxor                  m3, m1              ; unoffset
-    movq                [sq], m3              ; write back
-    psubsb                m7, m4              ; q1-= q1 add
-    pxor                  m7, m1              ; unoffset
-    movq               [s1q], m7              ; write back
-    RET
-
-;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int  src_pixel_step,
-;                            const char *blimit, const char *limit,
-;                            const char *thresh);
-INIT_MMX mmx
-cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \
-                              s, p, _blimit, _limit, _thresh, s1
-    lea                   sq, [sq + pq * 4 - 4]
-    lea                  s1q, [sq + pq]       ; s1q points to row +1
-    ;transpose
-    movq                  m6, [ sq + 2 * pq]  ; 67 66 65 64 63 62 61 60
-    movq                  m7, m6              ; 77 76 75 74 73 72 71 70
-    punpckhbw             m7, [s1q + 2 * pq]  ; 77 67 76 66 75 65 74 64
-    punpcklbw             m6, [s1q + 2 * pq]  ; 73 63 72 62 71 61 70 60
-    movq                  m4, [sq]            ; 47 46 45 44 43 42 41 40
-    movq                  m5, m4              ; 47 46 45 44 43 42 41 40
-    punpckhbw             m5, [sq + pq]       ; 57 47 56 46 55 45 54 44
-    punpcklbw             m4, [sq + pq]       ; 53 43 52 42 51 41 50 40
-    movq                  m3, m5              ; 57 47 56 46 55 45 54 44
-    punpckhwd             m5, m7              ; 77 67 57 47 76 66 56 46
-    punpcklwd             m3, m7              ; 75 65 55 45 74 64 54 44
-    movq                  m2, m4              ; 53 43 52 42 51 41 50 40
-    punpckhwd             m4, m6              ; 73 63 53 43 72 62 52 42
-    punpcklwd             m2, m6              ; 71 61 51 41 70 60 50 40
-    neg                   pq
-    movq                  m6, [ sq + pq * 2]  ; 27 26 25 24 23 22 21 20
-    movq                  m1, m6              ; 27 26 25 24 23 22 21 20
-    punpckhbw             m6, [ sq + pq    ]  ; 37 27 36 36 35 25 34 24
-    punpcklbw             m1, [ sq + pq    ]  ; 33 23 32 22 31 21 30 20
-    movq                  m7, [ sq + pq * 4]; ; 07 06 05 04 03 02 01 00
-    punpckhbw             m7, [s1q + pq * 4]  ; 17 07 16 06 15 05 14 04
-    movq                  m0, m7              ; 17 07 16 06 15 05 14 04
-    punpckhwd             m7, m6              ; 37 27 17 07 36 26 16 06
-    punpcklwd             m0, m6              ; 35 25 15 05 34 24 14 04
-    movq                  m6, m7              ; 37 27 17 07 36 26 16 06
-    punpckhdq             m7, m5              ; 77 67 57 47 37 27 17 07  = q3
-    punpckldq             m6, m5              ; 76 66 56 46 36 26 16 06  = q2
-    movq                  m5, m6              ; 76 66 56 46 36 26 16 06
-    psubusb               m5, m7              ; q2-q3
-    psubusb               m7, m6              ; q3-q2
-    por                   m7, m5;             ; m7=abs (q3-q2)
-    movq                  m5, m0              ; 35 25 15 05 34 24 14 04
-    punpckhdq             m5, m3              ; 75 65 55 45 35 25 15 05 = q1
-    punpckldq             m0, m3              ; 74 64 54 44 34 24 15 04 = q0
-    movq                  m3, m5              ; 75 65 55 45 35 25 15 05 = q1
-    psubusb               m3, m6              ; q1-q2
-    psubusb               m6, m5              ; q2-q1
-    por                   m6, m3              ; m6=abs(q2-q1)
-
-    movq       [stkreg + q1], m5              ; save q1
-    movq       [stkreg + q0], m0              ; save q0
-
-    movq                  m3, [ sq + pq * 4]  ; 07 06 05 04 03 02 01 00
-    punpcklbw             m3, [s1q + pq * 4]  ; 13 03 12 02 11 01 10 00
-    movq                  m0, m3              ; 13 03 12 02 11 01 10 00
-    punpcklwd             m0, m1              ; 31 21 11 01 30 20 10 00
-    punpckhwd             m3, m1              ; 33 23 13 03 32 22 12 02
-    movq                  m1, m0              ; 31 21 11 01 30 20 10 00
-    punpckldq             m0, m2              ; 70 60 50 40 30 20 10 00  =p3
-    punpckhdq             m1, m2              ; 71 61 51 41 31 21 11 01  =p2
-    movq                  m2, m1              ; 71 61 51 41 31 21 11 01  =p2
-    psubusb               m2, m0              ; p2-p3
-    psubusb               m0, m1              ; p3-p2
-    por                   m0, m2              ; m0=abs(p3-p2)
-    movq                  m2, m3              ; 33 23 13 03 32 22 12 02
-    punpckldq             m2, m4              ; 72 62 52 42 32 22 12 02 = p1
-    punpckhdq             m3, m4              ; 73 63 53 43 33 23 13 03 = p0
-
-    movq       [stkreg + p0], m3              ; save p0
-    movq       [stkreg + p1], m2              ; save p1
-    movq                  m5, m2              ; m5 = p1
-    psubusb               m2, m1              ; p1-p2
-    psubusb               m1, m5              ; p2-p1
-    por                   m1, m2              ; m1=abs(p2-p1)
-    movq                  m4, [_limitq]
-    GET_GOT         goffsetq
-%if GET_GOT_DEFINED=1
-    add rsp, gprsize                          ; restore stack
-%endif
-    psubusb               m7, m4
-    psubusb               m0, m4
-    psubusb               m1, m4
-    psubusb               m6, m4
-    por                   m7, m6
-    por                   m0, m1
-    por                   m0, m7              ; abs(q3-q2) > limit ||
-                                              ; abs(p3-p2) > limit ||
-                                              ; abs(p2-p1) > limit ||
-                                              ; abs(q2-q1) > limit
-    movq                  m1, m5              ; p1
-    movq                  m7, m3              ; m3=m7=p0
-    psubusb               m7, m5              ; p0 - p1
-    psubusb               m5, m3              ; p1 - p0
-    por                   m5, m7              ; abs(p1-p0)
-    movq       [stkreg + t0], m5              ; save abs(p1-p0)
-    psubusb               m5, m4
-    por                   m0, m5              ; m0=mask
-    movq                  m5, [stkreg + q0]   ; m5=q0
-    movq                  m7, [stkreg + q1]   ; m7=q1
-    movq                  m6, m5              ; m6=q0
-    movq                  m2, m7              ; q1
-    psubusb               m5, m7              ; q0-q1
-    psubusb               m7, m6              ; q1-q0
-    por                   m7, m5              ; abs(q1-q0)
-    movq       [stkreg + t1], m7              ; save abs(q1-q0)
-    psubusb               m7, m4
-    por                   m0, m7              ; mask
-    movq                  m5, m2              ; q1
-    psubusb               m5, m1              ; q1-=p1
-    psubusb               m1, m2              ; p1-=q1
-    por                   m5, m1              ; abs(p1-q1)
-    pand                  m5, [GLOBAL(tfe)]   ; set lsb of each byte to zero
-    psrlw                 m5, 1               ; abs(p1-q1)/2
-    movq                  m4, [_blimitq]
-    movq                  m1, m3              ; m1=m3=p0
-    movq                  m7, m6              ; m7=m6=q0
-    psubusb               m1, m7              ; p0-q0
-    psubusb               m7, m3              ; q0-p0
-    por                   m1, m7              ; abs(q0-p0)
-    paddusb               m1, m1              ; abs(q0-p0)*2
-    paddusb               m1, m5              ; abs(p0 - q0)*2 + abs(p1-q1)/2
-    psubusb               m1, m4              ; abs(p0 - q0)*2 + abs(p1-q1)/2
-                                              ; > blimit
-    por                   m1, m0;             ; mask
-    pxor                  m0, m0
-    pcmpeqb               m1, m0
-
-    ; calculate high edge variance
-    movq                  m7, [_threshq]
-    movq                  m4, [stkreg + t0]   ; get abs (q1 - q0)
-    psubusb               m4, m7
-    movq                  m3, [stkreg + t1]   ; get abs (p1 - p0)
-    psubusb               m3, m7
-    por                   m4, m3              ; abs(q1 - q0) > thresh ||
-                                              ; abs(p1 - p0) > thresh
-    pcmpeqb               m4, m0
-    pcmpeqb               m0, m0
-    movq                  m3, [GLOBAL(t80)]
-    pxor                  m4, m0
-
-    ; start work on filters
-    movq                  m2, [stkreg + p1]
-    movq                  m7, [stkreg + q1]
-    movq                  m6, [stkreg + p0]
-    movq                  m0, [stkreg + q0]
-    pxor                  m2, m3
-    pxor                  m7, m3
-    psubsb                m2, m7              ; p1 - q1
-    pand                  m2, m4              ; high var mask (hvm)(p1 - q1)
-    pxor                  m6, m3
-    pxor                  m0, m3
-    movq                  m3, m0              ; q0
-    psubsb                m0, m6              ; q0 - p0
-    paddsb                m2, m0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-    paddsb                m2, m0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-    pand                  m1, m2              ; mask filter values we don't
-                                              ; care about
-    movq                  m2, m1
-    paddsb                m1, [GLOBAL(t4)]    ; 3*(q0 - p0) + hvm(p1 - q1) + 4
-    paddsb                m2, [GLOBAL(t3)]    ; 3*(q0 - p0) + hvm(p1 - q1) + 3
-    pxor                  m0, m0
-    pxor                  m5, m5
-    punpcklbw             m0, m2
-    punpckhbw             m5, m2
-    psraw                 m0, 11
-    psraw                 m5, 11
-    packsswb              m0, m5
-    movq                  m2, m0              ; (3*(q0 - p0) + hvm(p1 - q1)
-                                              ; + 3) >> 3;
-    pxor                  m0, m0
-    movq                  m5, m1              ; abcdefgh
-    punpcklbw             m0, m1              ; e0f0g0h0
-    psraw                 m0, 11              ; sign extended shift right by 3
-    pxor                  m1, m1
-    punpckhbw             m1, m5              ; a0b0c0d0
-    psraw                 m1, 11              ; sign extended shift right by 3
-    movq                  m5, m0              ; save results
-    packsswb              m0, m1              ; (3*(q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>3
-    paddsw                m5, [GLOBAL(ones)]
-    paddsw                m1, [GLOBAL(ones)]
-    psraw                 m5, 1
-    psraw                 m1, 1
-    packsswb              m5, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
-                                              ; + 4) >>4
-    pandn                 m4, m5              ; high edge variance additive
-    movq                  m5, [GLOBAL(t80)]
-    paddsb                m6, m2              ; p0+= p0 add
-    pxor                  m6, m5              ; unoffset
-    ; m6=p0
-    movq                  m1, [stkreg + p1]
-    pxor                  m1, m5              ; reoffset
-    paddsb                m1, m4              ; p1+= p1 add
-    pxor                  m1, m5              ; unoffset
-    ; m6 = p0 m1 = p1
-    psubsb                m3, m0              ; q0-= q0 add
-    pxor                  m3, m5              ; unoffset
-    ; m3 = q0
-    psubsb                m7, m4              ; q1-= q1 add
-    pxor                  m7, m5              ; unoffset
-    ; m7 = q1
-    ; transpose and write back
-    ; m1 =    72 62 52 42 32 22 12 02
-    ; m6 =    73 63 53 43 33 23 13 03
-    ; m3 =    74 64 54 44 34 24 14 04
-    ; m7 =    75 65 55 45 35 25 15 05
-    movq                  m2, m1              ; 72 62 52 42 32 22 12 02
-    punpcklbw             m2, m6              ; 33 32 23 22 13 12 03 02
-    movq                  m4, m3              ; 74 64 54 44 34 24 14 04
-    punpckhbw             m1, m6              ; 73 72 63 62 53 52 43 42
-    punpcklbw             m4, m7              ; 35 34 25 24 15 14 05 04
-    punpckhbw             m3, m7              ; 75 74 65 64 55 54 45 44
-    movq                  m6, m2              ; 33 32 23 22 13 12 03 02
-    punpcklwd             m2, m4              ; 15 14 13 12 05 04 03 02
-    punpckhwd             m6, m4              ; 35 34 33 32 25 24 23 22
-    movq                  m5, m1              ; 73 72 63 62 53 52 43 42
-    punpcklwd             m1, m3              ; 55 54 53 52 45 44 43 42
-    punpckhwd             m5, m3              ; 75 74 73 72 65 64 63 62
-
-    ; m2 = 15 14 13 12 05 04 03 02
-    ; m6 = 35 34 33 32 25 24 23 22
-    ; m5 = 55 54 53 52 45 44 43 42
-    ; m1 = 75 74 73 72 65 64 63 62
-    movd   [sq + pq * 4 + 2], m2
-    psrlq                 m2, 32
-    movd  [s1q + pq * 4 + 2], m2
-    movd   [sq + pq * 2 + 2], m6
-    psrlq                 m6, 32
-    movd       [sq + pq + 2], m6
-    movd            [sq + 2], m1
-    psrlq                 m1, 32
-    movd           [s1q + 2], m1
-    neg                   pq
-    movd      [s1q + pq + 2], m5
-    psrlq                 m5, 32
-    movd  [s1q + pq * 2 + 2], m5
-    RET
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index e03508a..39a6ae3 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -18,6 +18,213 @@
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK do {                                                   \
+  /* (abs(q1 - q0), abs(p1 - p0) */                                            \
+  __m128i flat = abs_diff(q1p1, q0p0);                                         \
+  /* abs(p1 - q1), abs(p0 - q0) */                                             \
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                           \
+  __m128i abs_p0q0, abs_p1q1, work;                                            \
+                                                                               \
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */          \
+  hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero);  \
+  hev = _mm_cmpgt_epi16(hev, thresh);                                          \
+  hev = _mm_packs_epi16(hev, hev);                                             \
+                                                                               \
+  /* const int8_t mask = filter_mask(*limit, *blimit, */                       \
+  /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */       \
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p0 - q0) * 2 */\
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p1 - q1) */\
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                      \
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1);  /* abs(p1 - q1) / 2 */      \
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                    \
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                    \
+  /* abs(p3 - p2), abs(p2 - p1) */                                             \
+  work = abs_diff(p3p2, p2p1);                                                 \
+  flat = _mm_max_epu8(work, flat);                                             \
+  /* abs(q3 - q2), abs(q2 - q1) */                                             \
+  work = abs_diff(q3q2, q2q1);                                                 \
+  flat = _mm_max_epu8(work, flat);                                             \
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                          \
+  mask = _mm_unpacklo_epi64(mask, flat);                                       \
+  mask = _mm_subs_epu8(mask, limit);                                           \
+  mask = _mm_cmpeq_epi8(mask, zero);                                           \
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                         \
+} while (0)
+
+#define FILTER4 do {                                                           \
+  const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3,                    \
+                                    4, 4, 4, 4, 4, 4, 4, 4);                   \
+  const __m128i t80 = _mm_set1_epi8(0x80);                                     \
+  __m128i filter, filter2filter1, work;                                        \
+                                                                               \
+  ps1ps0 = _mm_xor_si128(p1p0, t80);  /* ^ 0x80 */                             \
+  qs1qs0 = _mm_xor_si128(q1q0, t80);                                           \
+                                                                               \
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                    \
+  work = _mm_subs_epi8(ps1ps0, qs1qs0);                                        \
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                        \
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */           \
+  filter = _mm_subs_epi8(filter, work);                                        \
+  filter = _mm_subs_epi8(filter, work);                                        \
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */               \
+  filter = _mm_and_si128(filter, mask);  /* & mask */                          \
+  filter = _mm_unpacklo_epi64(filter, filter);                                 \
+                                                                               \
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */                          \
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */                          \
+  filter2filter1 = _mm_adds_epi8(filter, t3t4);  /* signed_char_clamp */       \
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);                  \
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);          \
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);  /* >> 3 */             \
+  filter = _mm_srai_epi16(filter, 11);  /* >> 3 */                             \
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);                    \
+                                                                               \
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                        \
+  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */                       \
+  filter = _mm_unpacklo_epi8(filter, filter);                                  \
+  filter = _mm_srai_epi16(filter, 9);  /* round */                             \
+  filter = _mm_packs_epi16(filter, filter);                                    \
+  filter = _mm_andnot_si128(hev, filter);                                      \
+                                                                               \
+  hev = _mm_unpackhi_epi64(filter2filter1, filter);                            \
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);                 \
+                                                                               \
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */      \
+  qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                              \
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */      \
+  ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                         \
+  qs1qs0 = _mm_xor_si128(qs1qs0, t80);  /* ^ 0x80 */                           \
+  ps1ps0 = _mm_xor_si128(ps1ps0, t80);  /* ^ 0x80 */                           \
+} while (0)
+
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+                               const uint8_t *_blimit, const uint8_t *_limit,
+                               const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  p3p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);  // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);  // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+}
+
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+                             const uint8_t *_blimit, const uint8_t *_limit,
+                             const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i x0, x1, x2, x3;
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+
+  // Transpose 8x8
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  x0 = _mm_unpacklo_epi16(x2, x3);
+  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
+  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
+
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  x2 = _mm_unpackhi_epi16(x2, x3);
+  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
+  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
+  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
+  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+
+  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+}
+
 void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
diff --git a/vpx_dsp/x86/sad_mmx.asm b/vpx_dsp/x86/sad_mmx.asm
deleted file mode 100644
index 9968992..0000000
--- a/vpx_dsp/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vpx_sad16x16_mmx) PRIVATE
-global sym(vpx_sad8x16_mmx) PRIVATE
-global sym(vpx_sad8x8_mmx) PRIVATE
-global sym(vpx_sad4x4_mmx) PRIVATE
-global sym(vpx_sad16x8_mmx) PRIVATE
-
-;unsigned int vpx_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index c655e4b..cee4468 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -57,8 +57,8 @@
   paddd                %6, %1
 %endmacro
 
-%macro STORE_AND_RET 0
-%if mmsize == 16
+%macro STORE_AND_RET 1
+%if %1 > 4
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
@@ -78,16 +78,16 @@
   movd               [r1], m7           ; store sse
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
-%else ; mmsize == 8
-  pshufw               m4, m6, 0xe
-  pshufw               m3, m7, 0xe
+%else ; 4xh
+  pshuflw              m4, m6, 0xe
+  pshuflw              m3, m7, 0xe
   paddw                m6, m4
   paddd                m7, m3
   pcmpgtw              m5, m6           ; mask for 0 > x
   mov                  r1, ssem         ; r1 = unsigned int *sse
   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   movd               [r1], m7           ; store sse
-  pshufw               m4, m6, 0xe
+  pshuflw              m4, m6, 0xe
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %endif
@@ -196,6 +196,12 @@
   %endif
 %endif
 
+%if %1 == 4
+  %define movx movd
+%else
+  %define movx movh
+%endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -228,6 +234,7 @@
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+
 %if %2 == 0 ; !avg
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
@@ -237,24 +244,37 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
+  movx                 m0, [srcq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
 %endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
+  movx                 m2, [srcq+src_strideq]
 %endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+
 %if %2 == 1 ; avg
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -271,10 +291,10 @@
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_zero_y_nonhalf
 
   ; x_offset == 0 && y_offset == 0.5
@@ -296,37 +316,41 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq*2]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
-%else
-  punpckldq            m2, [srcq+src_strideq*2]
 %endif
-%endif
-  movh                 m1, [dstq]
-%if mmsize == 16
+  movx                 m1, [dstq]
+%if %1 > 4
   movlhps              m0, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
 %endif
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m4, [secq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m1, [dstq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -343,7 +367,7 @@
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
@@ -351,7 +375,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -424,12 +448,12 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -449,17 +473,27 @@
   pmullw               m4, filter_y_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -475,10 +509,10 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonzero:
-  cmp           x_offsetd, 8
+  cmp           x_offsetd, 4
   jne .x_nonhalf
   ; x_offset == 0.5
   test          y_offsetd, y_offsetd
@@ -503,30 +537,40 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m4, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m4, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
   movhps               m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
-  punpckldq            m4, [srcq+src_strideq+1]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
+  movx                 m2, [srcq+src_strideq+1]
+  punpckldq            m4, m2
 %endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
-  movh                 m1, [dstq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m1, [dstq]
   pavgb                m0, m4
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -543,10 +587,10 @@
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_half_y_nonhalf
 
   ; x_offset == 0.5 && y_offset == 0.5
@@ -578,53 +622,58 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m3, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq]
   punpckldq            m2, m1
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m1, [srcq+src_strideq+1]
   punpckldq            m3, m1
-%else
-  punpckldq            m2, [srcq+src_strideq]
-  punpckldq            m3, [srcq+src_strideq+1]
-%endif
 %endif
   pavgb                m2, m3
-%if mmsize == 16
+%if %1 > 4
   movlhps              m0, m2
   movhlps              m4, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
-  pshufw               m4, m2, 0xe
+  pshuflw              m4, m2, 0xe
 %endif
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq]
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq+1]
   pavgb                m2, m3
   pavgb                m4, m1
   pavgb                m0, m2
   pavgb                m2, m4
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
@@ -641,7 +690,7 @@
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
@@ -649,7 +698,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -724,23 +773,23 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 %if notcpuflag(ssse3)
   punpcklbw            m0, m5
 %endif
 .x_half_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
   pavgb                m2, m1
   pavgb                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -760,16 +809,26 @@
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -786,7 +845,7 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf:
   test          y_offsetd, y_offsetd
@@ -797,7 +856,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -865,14 +924,14 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_x_a
   pmaddubsw            m2, filter_x_a
@@ -892,17 +951,27 @@
   pmullw               m4, filter_x_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -918,10 +987,10 @@
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
@@ -929,7 +998,7 @@
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1037,8 +1106,8 @@
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1054,17 +1123,17 @@
   add                srcq, src_strideq
   psraw                m0, 4
 .x_other_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
 %else
@@ -1079,9 +1148,9 @@
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %endif
   psraw                m2, 4
   psraw                m4, 4
@@ -1089,10 +1158,20 @@
   pavgw                m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
@@ -1110,7 +1189,7 @@
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
 %ifdef PIC
@@ -1118,7 +1197,7 @@
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1261,8 +1340,8 @@
   INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1283,20 +1362,20 @@
   INC_SRC_BY_SRC_STRIDE
 
 .x_other_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
 
   INC_SRC_BY_SRC_STRIDE
-  movh                 m4, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m4, [srcq]
+  movx                 m3, [srcq+1]
 
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m3, [dstq+dst_strideq]
-  movh                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
@@ -1335,9 +1414,9 @@
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m0, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m3, m5
@@ -1345,10 +1424,20 @@
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
@@ -1366,7 +1455,8 @@
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+%undef movx
+  STORE_AND_RET %1
 %endmacro
 
 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@@ -1375,26 +1465,22 @@
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4
 INIT_XMM sse2
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4, 1
 INIT_XMM sse2
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4, 1
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 7851a98..f8c9711 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -45,7 +45,7 @@
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                 sse, &sum, vpx_get16x16var_avx2, 16);
-  return *sse - (((unsigned int)sum * sum) >> 8);
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
deleted file mode 100644
index b8ba79b..0000000
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,744 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift            7
-
-;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
-global sym(vpx_get_mb_ss_mmx) PRIVATE
-sym(vpx_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vpx_get8x8var_mmx) PRIVATE
-sym(vpx_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        ;              movq        mm4, [rbx + rdx]
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void
-;vpx_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vpx_get4x4var_mmx) PRIVATE
-sym(vpx_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
deleted file mode 100644
index f04f4e2..0000000
--- a/vpx_dsp/x86/variance_mmx.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-
-DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
-  { 128, 128, 128, 128,   0,   0,   0,   0 },
-  { 112, 112, 112, 112,  16,  16,  16,  16 },
-  {  96,  96,  96,  96,  32,  32,  32,  32 },
-  {  80,  80,  80,  80,  48,  48,  48,  48 },
-  {  64,  64,  64,  64,  64,  64,  64,  64 },
-  {  48,  48,  48,  48,  80,  80,  80,  80 },
-  {  32,  32,  32,  32,  96,  96,  96,  96 },
-  {  16,  16,  16,  16, 112, 112, 112, 112 }
-};
-
-extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              unsigned int *sse, int *sum);
-
-extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
-                                              int ref_pixels_per_line,
-                                              const unsigned char *src_ptr,
-                                              int src_pixels_per_line,
-                                              const int16_t *HFilter,
-                                              const int16_t *VFilter,
-                                              int *sum,
-                                              unsigned int *sumsquared);
-
-extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
-                                           int ref_pixels_per_line,
-                                           const unsigned char *src_ptr,
-                                           int src_pixels_per_line,
-                                           unsigned int Height,
-                                           const int16_t *HFilter,
-                                           const int16_t *VFilter,
-                                           int *sum,
-                                           unsigned int *sumsquared);
-
-
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
-                                 const unsigned char *b, int b_stride,
-                                 unsigned int *sse) {
-    unsigned int var;
-    int avg;
-
-    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
-                                 const unsigned char *b, int b_stride,
-                                 unsigned int *sse) {
-    unsigned int var;
-    int avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
-                              const unsigned char *b, int b_stride,
-                              unsigned int *sse) {
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse2, &sum2);
-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    *sse = var;
-    return var;
-}
-
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
-                                   const unsigned char *b, int b_stride,
-                                   unsigned int *sse) {
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse2, &sum2);
-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
-                                  const unsigned char *b, int b_stride,
-                                  unsigned int *sse) {
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
-                                  const unsigned char *b, int b_stride,
-                                  unsigned int *sse) {
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
-                                       int xoffset, int yoffset,
-                                       const uint8_t *b, int b_stride,
-                                       uint32_t *sse) {
-    int xsum;
-    unsigned int xxsum;
-    vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
-                                      bilinear_filters_mmx[xoffset],
-                                      bilinear_filters_mmx[yoffset],
-                                      &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
-                                       int xoffset, int yoffset,
-                                       const uint8_t *b, int b_stride,
-                                       uint32_t *sse) {
-    int xsum;
-    uint32_t xxsum;
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((uint32_t)xsum * xsum) >> 6));
-}
-
-uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
-                                         int xoffset, int yoffset,
-                                         const uint8_t *b, int b_stride,
-                                         uint32_t *sse) {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum0, &xxsum0);
-
-    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
-                                        int xoffset, int yoffset,
-                                        const uint8_t *b, int b_stride,
-                                        uint32_t *sse) {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum0, &xxsum0);
-
-    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
-}
-
-uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
-                                        int xoffset, int yoffset,
-                                        const uint8_t *b, int b_stride,
-                                        uint32_t *sse) {
-    int xsum;
-    unsigned int xxsum;
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((uint32_t)xsum * xsum) >> 7));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
-}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index dc51173..c2b55a3 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -171,7 +171,7 @@
                                   unsigned int *sse) {
   int sum;
   get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 4);
+  return *sse - ((sum * sum) >> 4);
 }
 
 unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
@@ -180,7 +180,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                 sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
+  return *sse - ((sum * sum) >> 5);
 }
 
 unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
@@ -189,7 +189,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                 sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
+  return *sse - ((sum * sum) >> 5);
 }
 
 unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
@@ -197,7 +197,7 @@
                                   unsigned int *sse) {
   int sum;
   vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 6);
+  return *sse - ((sum * sum) >> 6);
 }
 
 unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
@@ -206,7 +206,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                 sse, &sum, vpx_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
+  return *sse - ((sum * sum) >> 7);
 }
 
 unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
@@ -215,7 +215,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                 sse, &sum, vpx_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
+  return *sse - ((sum * sum) >> 7);
 }
 
 unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
@@ -223,7 +223,7 @@
                                     unsigned int *sse) {
   int sum;
   vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 8);
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
@@ -320,16 +320,16 @@
                                           int height, unsigned int *sse, \
                                           void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-  DECL(4, opt2); \
+  DECL(4, opt1); \
   DECL(8, opt1); \
   DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
 unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                      int src_stride, \
                                                      int x_offset, \
@@ -365,25 +365,25 @@
     } \
   } \
   *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
 }
 
 #define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
-FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
-FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
-FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
-FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+FN(16,  8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (int32_t), (int32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (int32_t), (int32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (int32_t), (int32_t)); \
+FN(4,   8,  4, 2, 3, opt1, (int32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt1, (int32_t), (int32_t))
 
-FNS(sse2, sse);
+FNS(sse2, sse2);
 FNS(ssse3, ssse3);
 
 #undef FNS
@@ -401,16 +401,16 @@
                                             int height, unsigned int *sse, \
                                             void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-DECL(4, opt2); \
+DECL(4, opt1); \
 DECL(8, opt1); \
 DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
 unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                          int src_stride, \
                                                          int x_offset, \
@@ -451,23 +451,23 @@
     } \
   } \
   *sseptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
 }
 
 #define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
-FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
-FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
-FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
-FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t), (int32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t), (int32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t), (int32_t)); \
+FN(4,   8,  4, 2, 3, opt1, (uint32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt1, (uint32_t), (int32_t))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h
index 1f8f914..620df31 100644
--- a/vpx_ports/mem_ops.h
+++ b/vpx_ports/mem_ops.h
@@ -89,7 +89,7 @@
   unsigned MEM_VALUE_T  val;
   const MAU_T          *mem = (const MAU_T *)vmem;
 
-  val = mem[0] << 24;
+  val = ((unsigned MEM_VALUE_T)mem[0]) << 24;
   val |= mem[1] << 16;
   val |= mem[2] << 8;
   val |= mem[3];
@@ -125,7 +125,7 @@
   unsigned MEM_VALUE_T  val;
   const MAU_T          *mem = (const MAU_T *)vmem;
 
-  val = mem[3] << 24;
+  val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
   val |= mem[2] << 16;
   val |= mem[1] << 8;
   val |= mem[0];
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index e3ebc53..bae25ac 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -12,6 +12,11 @@
 #ifndef VPX_PORTS_X86_H_
 #define VPX_PORTS_X86_H_
 #include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>  /* For __cpuidex, __rdtsc */
+#endif
+
 #include "vpx_config.h"
 #include "vpx/vpx_integer.h"
 
@@ -77,16 +82,12 @@
 #else /* end __SUNPRO__ */
 #if ARCH_X86_64
 #if defined(_MSC_VER) && _MSC_VER > 1500
-void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
-#pragma intrinsic(__cpuidex)
 #define cpuid(func, func2, a, b, c, d) do {\
     int regs[4];\
     __cpuidex(regs, func, func2); \
     a = regs[0];  b = regs[1];  c = regs[2];  d = regs[3];\
   } while(0)
 #else
-void __cpuid(int CPUInfo[4], int info_type);
-#pragma intrinsic(__cpuid)
 #define cpuid(func, func2, a, b, c, d) do {\
     int regs[4];\
     __cpuid(regs, func); \
@@ -172,7 +173,7 @@
   env = getenv("VPX_SIMD_CAPS_MASK");
 
   if (env && *env)
-    mask = strtol(env, NULL, 0);
+    mask = (unsigned int)strtoul(env, NULL, 0);
 
   /* Ensure that the CPUID instruction supports extended features */
   cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
@@ -212,10 +213,6 @@
   return flags & mask;
 }
 
-#if ARCH_X86_64 && defined(_MSC_VER)
-unsigned __int64 __rdtsc(void);
-#pragma intrinsic(__rdtsc)
-#endif
 // Note:
 //  32-bit CPU cycle counter is light-weighted for most function performance
 //  measurement. For large function (CPU time > a couple of seconds), 64-bit
diff --git a/vpx_util/vpx_thread.h b/vpx_util/vpx_thread.h
index de63c4d..2062abd 100644
--- a/vpx_util/vpx_thread.h
+++ b/vpx_util/vpx_thread.h
@@ -147,6 +147,152 @@
   pthread_mutex_lock(mutex);
   return !ok;
 }
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>    // NOLINT
+
+#include <errno.h>  // NOLINT
+#include <stdlib.h> // NOLINT
+#include <sys/builtin.h> // NOLINT
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+  HEV event_sem_;
+  HEV ack_sem_;
+  volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+  void* (*start_)(void*);
+  void* arg_;
+} thread_arg;
+
+static void thread_start(void* arg) {
+  thread_arg targ = *(thread_arg *)arg;
+  free(arg);
+
+  targ.start_(targ.arg_);
+}
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+                                 void* (*start)(void*),
+                                 void* arg) {
+  int tid;
+  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+  if (targ == NULL) return 1;
+
+  (void)attr;
+
+  targ->start_ = start;
+  targ->arg_ = arg;
+  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+  if (tid == -1) {
+    free(targ);
+    return 1;
+  }
+
+  *thread = tid;
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;
+  return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void* mutexattr) {
+  (void)mutexattr;
+  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;
+  ok &= DosCloseEventSem(condition->event_sem_) == 0;
+  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+  return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void* cond_attr) {
+  int ok = 1;
+  (void)cond_attr;
+
+  ok &= DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE)
+          == 0;
+  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+  if (!ok) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  condition->wait_count_ = 0;
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+    ok &= DosPostEventSem(condition->event_sem_) == 0;
+    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+  }
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+      ok &= pthread_cond_signal(condition) == 0;
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok = 1;
+
+  __atomic_increment(&condition->wait_count_);
+
+  ok &= pthread_mutex_unlock(mutex) == 0;
+
+  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+  __atomic_decrement(&condition->wait_count_);
+
+  ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+  pthread_mutex_lock(mutex);
+
+  return !ok;
+}
 #else  // _WIN32
 #include <pthread.h> // NOLINT
 # define THREADFN void*
diff --git a/vpxdec.c b/vpxdec.c
index 235d17a..d96b39c 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -268,8 +268,7 @@
   switch (input->vpx_input_ctx->file_type) {
 #if CONFIG_WEBM_IO
     case FILE_TYPE_WEBM:
-      return webm_read_frame(input->webm_ctx,
-                             buf, bytes_in_buffer, buffer_size);
+      return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer);
 #endif
     case FILE_TYPE_RAW:
       return raw_read_frame(input->vpx_input_ctx->file,
diff --git a/vpxenc.c b/vpxenc.c
index d988b30..6463334 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -444,6 +444,11 @@
 
 static const arg_def_t tune_content = ARG_DEF_ENUM(
     NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+
+static const arg_def_t target_level = ARG_DEF(
+    NULL, "target-level", 1,
+    "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
+    " 11: level 1.1; ... 62: level 6.2)");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -454,7 +459,7 @@
   &gf_cbr_boost_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
   &noise_sens, &tune_content, &input_color_space,
-  &min_gf_interval, &max_gf_interval,
+  &min_gf_interval, &max_gf_interval, &target_level,
 #if CONFIG_VP9_HIGHBITDEPTH
   &bitdeptharg, &inbitdeptharg,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -470,7 +475,7 @@
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
   VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
-  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL, VP9E_SET_TARGET_LEVEL,
   0
 };
 #endif
@@ -807,7 +812,7 @@
 
 #if !CONFIG_WEBM_IO
 typedef int stereo_format_t;
-struct EbmlGlobal { int debug; };
+struct WebmOutputContext { int debug; };
 #endif
 
 /* Per-stream configuration */
@@ -835,7 +840,7 @@
   struct stream_config      config;
   FILE                     *file;
   struct rate_hist         *rate_hist;
-  struct EbmlGlobal         ebml;
+  struct WebmOutputContext  webm_ctx;
   uint64_t                  psnr_sse_total;
   uint64_t                  psnr_samples_total;
   double                    psnr_totals[4];
@@ -1078,13 +1083,13 @@
     stream->config.write_webm = 1;
 #if CONFIG_WEBM_IO
     stream->config.stereo_fmt = STEREO_FORMAT_MONO;
-    stream->ebml.last_pts_ns = -1;
-    stream->ebml.writer = NULL;
-    stream->ebml.segment = NULL;
+    stream->webm_ctx.last_pts_ns = -1;
+    stream->webm_ctx.writer = NULL;
+    stream->webm_ctx.segment = NULL;
 #endif
 
     /* Allows removal of the application version from the EBML tags */
-    stream->ebml.debug = global->debug;
+    stream->webm_ctx.debug = global->debug;
 
     /* Default lag_in_frames is 0 in realtime mode */
     if (global->deadline == VPX_DL_REALTIME)
@@ -1466,8 +1471,8 @@
 
 #if CONFIG_WEBM_IO
   if (stream->config.write_webm) {
-    stream->ebml.stream = stream->file;
-    write_webm_file_header(&stream->ebml, cfg,
+    stream->webm_ctx.stream = stream->file;
+    write_webm_file_header(&stream->webm_ctx, cfg,
                            &global->framerate,
                            stream->config.stereo_fmt,
                            global->codec->fourcc,
@@ -1492,7 +1497,7 @@
 
 #if CONFIG_WEBM_IO
   if (stream->config.write_webm) {
-    write_webm_file_footer(&stream->ebml);
+    write_webm_file_footer(&stream->webm_ctx);
   }
 #endif
 
@@ -1730,7 +1735,7 @@
         update_rate_histogram(stream->rate_hist, cfg, pkt);
 #if CONFIG_WEBM_IO
         if (stream->config.write_webm) {
-          write_webm_block(&stream->ebml, cfg, pkt);
+          write_webm_block(&stream->webm_ctx, cfg, pkt);
         }
 #endif
         if (!stream->config.write_webm) {
diff --git a/webmdec.cc b/webmdec.cc
index 81150aa..93835e1 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -122,7 +122,6 @@
 
 int webm_read_frame(struct WebmInputContext *webm_ctx,
                     uint8_t **buffer,
-                    size_t *bytes_in_buffer,
                     size_t *buffer_size) {
   // This check is needed for frame parallel decoding, in which case this
   // function could be called even after it has reached end of input stream.
@@ -147,7 +146,7 @@
     } else if (block_entry_eos || block_entry->EOS()) {
       cluster = segment->GetNext(cluster);
       if (cluster == NULL || cluster->EOS()) {
-        *bytes_in_buffer = 0;
+        *buffer_size = 0;
         webm_ctx->reached_eos = 1;
         return 1;
       }
@@ -187,10 +186,9 @@
     if (*buffer == NULL) {
       return -1;
     }
-    *buffer_size = frame.len;
     webm_ctx->buffer = *buffer;
   }
-  *bytes_in_buffer = frame.len;
+  *buffer_size = frame.len;
   webm_ctx->timestamp_ns = block->GetTime(cluster);
   webm_ctx->is_key_frame = block->IsKey();
 
@@ -203,10 +201,9 @@
                          struct VpxInputContext *vpx_ctx) {
   uint32_t i = 0;
   uint8_t *buffer = NULL;
-  size_t bytes_in_buffer = 0;
   size_t buffer_size = 0;
   while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
-    if (webm_read_frame(webm_ctx, &buffer, &bytes_in_buffer, &buffer_size)) {
+    if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) {
       break;
     }
     ++i;
diff --git a/webmdec.h b/webmdec.h
index 7d16380..aa371f3 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -42,22 +42,18 @@
 
 // Reads a WebM Video Frame. Memory for the buffer is created, owned and managed
 // by this function. For the first call, |buffer| should be NULL and
-// |*bytes_in_buffer| should be 0. Once all the frames are read and used,
+// |*buffer_size| should be 0. Once all the frames are read and used,
 // webm_free() should be called, otherwise there will be a leak.
 // Parameters:
 //      webm_ctx - WebmInputContext object
 //      buffer - pointer where the frame data will be filled.
-//      bytes_in_buffer - pointer to buffer size.
-//      buffer_size - unused TODO(vigneshv): remove this
+//      buffer_size - pointer to buffer size.
 // Return values:
 //      0 - Success
 //      1 - End of Stream
 //     -1 - Error
-// TODO(vigneshv): Make the return values consistent across all functions in
-// this file.
 int webm_read_frame(struct WebmInputContext *webm_ctx,
                     uint8_t **buffer,
-                    size_t *bytes_in_buffer,
                     size_t *buffer_size);
 
 // Guesses the frame rate of the input file based on the container timestamps.
diff --git a/webmenc.cc b/webmenc.cc
index caf4391..b1d3259 100644
--- a/webmenc.cc
+++ b/webmenc.cc
@@ -20,13 +20,13 @@
 const int kVideoTrackNumber = 1;
 }  // namespace
 
-void write_webm_file_header(struct EbmlGlobal *glob,
+void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             stereo_format_t stereo_fmt,
                             unsigned int fourcc,
                             const struct VpxRational *par) {
-  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(glob->stream);
+  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
   mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
   segment->Init(writer);
   segment->set_mode(mkvmuxer::Segment::kFile);
@@ -36,7 +36,7 @@
   const uint64_t kTimecodeScale = 1000000;
   info->set_timecode_scale(kTimecodeScale);
   std::string version = "vpxenc";
-  if (!glob->debug) {
+  if (!webm_ctx->debug) {
     version.append(std::string(" ") + vpx_codec_version_str());
   }
   info->set_writing_app(version.c_str());
@@ -74,23 +74,23 @@
     video_track->set_display_width(display_width);
     video_track->set_display_height(cfg->g_h);
   }
-  if (glob->debug) {
+  if (webm_ctx->debug) {
     video_track->set_uid(kDebugTrackUid);
   }
-  glob->writer = writer;
-  glob->segment = segment;
+  webm_ctx->writer = writer;
+  webm_ctx->segment = segment;
 }
 
-void write_webm_block(struct EbmlGlobal *glob,
+void write_webm_block(struct WebmOutputContext *webm_ctx,
                       const vpx_codec_enc_cfg_t *cfg,
                       const vpx_codec_cx_pkt_t *pkt) {
   mkvmuxer::Segment *const segment =
-      reinterpret_cast<mkvmuxer::Segment*>(glob->segment);
+      reinterpret_cast<mkvmuxer::Segment*>(webm_ctx->segment);
   int64_t pts_ns = pkt->data.frame.pts * 1000000000ll *
                    cfg->g_timebase.num / cfg->g_timebase.den;
-  if (pts_ns <= glob->last_pts_ns)
-    pts_ns = glob->last_pts_ns + 1000000;
-  glob->last_pts_ns = pts_ns;
+  if (pts_ns <= webm_ctx->last_pts_ns)
+    pts_ns = webm_ctx->last_pts_ns + 1000000;
+  webm_ctx->last_pts_ns = pts_ns;
 
   segment->AddFrame(static_cast<uint8_t*>(pkt->data.frame.buf),
                     pkt->data.frame.sz,
@@ -99,14 +99,14 @@
                     pkt->data.frame.flags & VPX_FRAME_IS_KEY);
 }
 
-void write_webm_file_footer(struct EbmlGlobal *glob) {
+void write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
   mkvmuxer::MkvWriter *const writer =
-      reinterpret_cast<mkvmuxer::MkvWriter*>(glob->writer);
+      reinterpret_cast<mkvmuxer::MkvWriter*>(webm_ctx->writer);
   mkvmuxer::Segment *const segment =
-      reinterpret_cast<mkvmuxer::Segment*>(glob->segment);
+      reinterpret_cast<mkvmuxer::Segment*>(webm_ctx->segment);
   segment->Finalize();
   delete segment;
   delete writer;
-  glob->writer = NULL;
-  glob->segment = NULL;
+  webm_ctx->writer = NULL;
+  webm_ctx->segment = NULL;
 }
diff --git a/webmenc.h b/webmenc.h
index c255d3d..ad30664 100644
--- a/webmenc.h
+++ b/webmenc.h
@@ -20,8 +20,7 @@
 extern "C" {
 #endif
 
-/* TODO(vigneshv): Rename this struct */
-struct EbmlGlobal {
+struct WebmOutputContext {
   int debug;
   FILE *stream;
   int64_t last_pts_ns;
@@ -38,18 +37,18 @@
   STEREO_FORMAT_RIGHT_LEFT = 11
 } stereo_format_t;
 
-void write_webm_file_header(struct EbmlGlobal *glob,
+void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             stereo_format_t stereo_fmt,
                             unsigned int fourcc,
                             const struct VpxRational *par);
 
-void write_webm_block(struct EbmlGlobal *glob,
+void write_webm_block(struct WebmOutputContext *webm_ctx,
                       const vpx_codec_enc_cfg_t *cfg,
                       const vpx_codec_cx_pkt_t *pkt);
 
-void write_webm_file_footer(struct EbmlGlobal *glob);
+void write_webm_file_footer(struct WebmOutputContext *webm_ctx);
 
 #ifdef __cplusplus
 }  // extern "C"
